In [None]:
# Download the aclImdb_v1 review archive. `-nc` skips the download when the
# file already exists locally; `-q` silences wget's progress output.
!wget -q -nc http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [None]:
import tarfile

# Unpack the downloaded archive into the current working directory
# (extractall() is called without a target path). The later cells read the
# resulting `aclImdb` folder.
with tarfile.open('aclImdb_v1.tar.gz','r') as tar:
  tar.extractall()

In [None]:
pip install pyprind

Collecting pyprind
  Downloading PyPrind-2.11.3-py2.py3-none-any.whl (8.4 kB)
Installing collected packages: pyprind
Successfully installed pyprind-2.11.3


In [None]:
import pyprind
import pandas as pd
import os

In [None]:
# Root folder created by extracting the archive.
basepath = 'aclImdb'

# Maps the name of each class sub-folder to its numeric sentiment label.
labels = {'pos':1, 'neg':0}

In [None]:
# Read every review file under aclImdb/{test,train}/{pos,neg} into one
# DataFrame with the columns 'review' (text) and 'sentiment' (0/1).
#
# Rows are collected in a plain list and the DataFrame is built once at the
# end: calling pd.concat inside the loop copies the whole frame on every
# iteration, which is quadratic in the number of files.
pbar = pyprind.ProgBar(50000)
records = []
for split in ('test', 'train'):
  for label_dir in ('pos', 'neg'):
    path = os.path.join(basepath, split, label_dir)
    # sorted() keeps the row order deterministic across file systems.
    for file in sorted(os.listdir(path)):
      with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
        records.append((infile.read(), labels[label_dir]))
      pbar.update()

# NOTE: 'sentiment' is now an integer column from the start (it used to be
# object dtype until the CSV round trip in the following cells).
df = pd.DataFrame(records, columns=['review', 'sentiment'])

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:13


In [None]:
import numpy as np

# Shuffle the rows with a fixed seed (the files were read class by class, so
# the frame is ordered by label) and persist the result as CSV.
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

In [None]:
# Reload the shuffled data from disk. Because the CSV was written with
# index=False, the frame gets a fresh 0..n-1 index here.
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [None]:
# Sanity check: (number of reviews, number of columns).
df.shape

(50000, 2)

## Vectorización

### Bag of words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Toy corpus of four short documents used to illustrate bag-of-words counts.
count = CountVectorizer()
docs = np.array(['The sun is shining',
                 'The weather is sweet',
                 'The sun is shining, the weather is sweet',
                 'and one and one is two'])

# Learn the vocabulary from `docs` and build the document-term count matrix.
bag = count.fit_transform(docs)

In [None]:
# Mapping from each learned term to its column index in the count matrix.
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [None]:
# Dense view of the count matrix: one row per document, one column per term.
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [0 2 0 1 1 1 2 0 1]
 [2 1 2 0 0 0 0 1 0]]


### TfIdf

$$\text{tf-idf}(t,d) = tf(t,d)\times idf(t,d)$$

El componente $tf$ (frecuencia del término) corresponde a los conteos del bag of words visto anteriormente, mientras que el $idf$ (frecuencia inversa de documento) se define como

$$idf(t,d)=\log \frac{n_d}{1+df(d,t)}$$

donde $n_d$ es el número total de documentos y $df(d,t)$ es el número de documentos $d$ que contienen el término $t$.

`scikit-learn` posee un transformador (`TfidfTransformer`) que toma una vectorización del tipo bag of words y la transforma en tf-idf. No obstante, si se hiciera a mano con la fórmula anterior el resultado sería diferente, ya que la ecuación que usa `scikit-learn` es ligeramente distinta:

$$idf(t,d)=\log\frac{1+n_d}{1+df(d,t)}+1$$

Es usual normalizar las frecuencias de los términos antes de llevar a cabo una vectorización tf-idf. Sin embargo, el transformador de `scikit-learn` ya incorpora la normalización $\ell_2$ de cada vector de documento (`norm='l2'` por defecto):
$$v_{norm}=\frac{v}{||v||_2}$$

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Convert raw term counts of the toy corpus into tf-idf weights.
# Note: the counts are recomputed with count.fit_transform(docs) here rather
# than reusing the `bag` matrix built earlier.
tfidf = TfidfTransformer(use_idf=True)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.         0.37632116 0.         0.56855566 0.56855566 0.
  0.46029481 0.         0.        ]
 [0.         0.37632116 0.         0.         0.         0.56855566
  0.46029481 0.         0.56855566]
 [0.         0.4574528  0.         0.3455657  0.3455657  0.3455657
  0.55953044 0.         0.3455657 ]
 [0.65680405 0.1713738  0.65680405 0.         0.         0.
  0.         0.32840203 0.        ]]


In [None]:
# Last 50 characters of the first review, to show the raw text before cleaning.
df.loc[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

## Limpieza de texto

### Caracteres anormales con expresiones regulares

In [None]:
import re
def preprocessor(text):
  """Clean a raw review string.

  Steps:
    1. Remove HTML tags (anything between ``<`` and ``>``).
    2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P`` from the
       tag-free text (before lower-casing, so ``D``/``P`` stay upper case).
    3. Lower-case the text and collapse every run of non-word characters
       into a single space.
    4. Append the collected emoticons, with any ``-`` nose removed, at the
       end of the cleaned text, separated from it by exactly one space.

  Args:
    text: the raw review as a ``str``.

  Returns:
    The cleaned string. When no emoticon is found the result is just the
    lower-cased, punctuation-free text.
  """
  # Raw string literals: the previous plain literals relied on invalid escape
  # sequences, which newer Python versions warn about.
  text = re.sub(r'<[^>]*>', '', text)
  emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
  cleaned = re.sub(r'[\W]+', ' ', text.lower())
  if not emoticons:
    return cleaned
  # Without an explicit separator the first emoticon was glued to the last
  # word whenever the cleaned text did not already end in a space
  # (e.g. 'i loved it:)'), producing a bogus token.
  separator = '' if cleaned.endswith(' ') else ' '
  return cleaned + separator + ' '.join(emoticons).replace('-', '')

In [None]:
# Same 50-character excerpt as before, after cleaning.
preprocessor(df.loc[0,'review'][-50:])

'is seven title brazil not available'

In [None]:
# Clean every review. This overwrites the 'review' column, so from here on
# `df` holds the cleaned text only.
df['review'] = df['review'].apply(preprocessor)

### Tokenización

In [None]:
def tokenizer(text):
  """Split ``text`` on runs of whitespace and return the list of tokens."""
  tokens = text.split()
  return tokens

tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

## Stemming

En este caso empleamos stemming en lugar de lematización, dado que es menos costoso computacionalmente.

In [None]:
from nltk.stem.porter import PorterStemmer
# One shared stemmer instance, reused by every call to tokenizer_porter.
porter = PorterStemmer()
def tokenizer_porter(text):
  """Split ``text`` on whitespace and return the Porter stem of each token."""
  return list(map(porter.stem, text.split()))

tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

### Stopwords

In [None]:
import nltk
# Fetch the NLTK stop-word corpus; it must be present before
# stopwords.words(...) can be called in the next cell.
nltk.download('stopwords')

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# English stop-word list; also reused later in the grid-search parameters.
stop = stopwords.words('english')
# Stem the sentence, keep at most its last ten tokens, then drop stop words.
# The filter is applied to the stemmed tokens.
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

### Aprendizaje no supervisado: LatentDirichletAllocation

Este es un modelo probabilístico generativo que intenta encontrar grupos de palabras que aparecen juntas frecuentemente en diferentes documentos. El input del LDA es un bag-of-words, a partir de allí el LDA descompone la matriz en dos nuevas matrices:    

- Una matriz documento a tópico
- Una matriz palabra a tópico

Si se multiplicasen estas matrices se obtendría el input con el menor margen de error posible. Existe un problema: el hiperparámetro, número de tópicos, debe seleccionarse a priori.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words input for LDA. Per the scikit-learn documentation:
#   stop_words='english' uses the library's built-in English stop-word list,
#   max_df=.1 ignores terms that occur in more than 10% of the documents,
#   max_features=5000 keeps only the 5000 most frequent remaining terms.
# NOTE: this rebinds `count`, replacing the vectorizer fitted on the toy corpus.
count = CountVectorizer(stop_words='english',
                        max_df=.1,
                        max_features=5000)
X = count.fit_transform(df['review'].values)

Los hiperparámetros aquí utilizados son `max_df=.1`, que descarta las palabras que aparecen en más del 10 % de los documentos (es decir, fija la frecuencia de documento máxima permitida), y `max_features=5000`, que conserva únicamente las 5000 palabras más frecuentes del corpus.

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
# Fit a 10-topic LDA model on the count matrix; random_state fixes the
# initialisation so the topics are reproducible.
lda = LatentDirichletAllocation(n_components=10,
                                random_state=123,
                                learning_method='batch')
# Per-document topic matrix: one row per review, one column per topic.
X_topics = lda.fit_transform(X)

In [None]:
# One row per topic (n_components=10), one column per vocabulary term
# (max_features=5000).
lda.components_.shape

(10, 5000)

In [None]:
# NOTE(review): leftover inspection cell (shows the class of `count`);
# candidate for removal from the final notebook.
type(count)

In [None]:
# Print, for every topic, the n_top_words terms with the largest weights.
n_top_words = 10
feature_names = count.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
  # argsort is ascending, so reverse it and keep the first n_top_words.
  top_indices = topic.argsort()[::-1][:n_top_words]
  top_terms = [feature_names[i] for i in top_indices]
  print(f'Topic: {topic_idx + 1}')
  print(' '.join(top_terms))

Topic: 1
horror original comedy black house version classic genre white night
Topic: 2
worst minutes guy script money boring stupid waste wasn comedy
Topic: 3
book dvd read version watched original video remember tv novel
Topic: 4
family performance father beautiful mother woman role lives true performances
Topic: 5
series episode tv kids comedy episodes shows family fun season
Topic: 6
murder police wife john plays crime woman role thriller town
Topic: 7
documentary camera effects audience sense human use style art shot
Topic: 8
music song songs musical role dance rock performance star dancing
Topic: 9
horror effects guy dead budget gore killer special blood looks
Topic: 10
action war game fight american japanese hero battle animation fighting


In [None]:
# Column index 1 of X_topics is the second topic (printed as "Topic: 2" in
# the listing above). Rank the reviews by their weight on that topic,
# highest first, and show the first 300 characters of the top three.
topic_scores = X_topics[:, 1]
bad_movie = topic_scores.argsort()[::-1]
for iter_idx, movie_idx in enumerate(bad_movie[:3]):
  excerpt = df['review'][movie_idx][:300]
  print(f'Bad movie {iter_idx+1}')
  print(excerpt, '...')

Bad movie 1
i have to admit when i went to see this movie i didn t really have high expectations but even with my low expectations i was totally and utterly disappointed basically luke wilson is a hot shot who tends to go out with slightly crazy girlfriends there s slight mention of a girl stalking him but that ...
Bad movie 2
larry bishop directs writes and leads this soft core porn plot less biker movie about nothing to do with anything to call this one of the worst movies of 2008 is being kind to the garbage that i spent money on while in theaters its one of the worst movies i have ever seen i felt sorry for the girls  ...
Bad movie 3
okay so i found out about this movie and i watched the preview read almost all the reviews and was having a hard time debating whether i should watch it or not before i even watched the movie i was emotionally weird on it i was so unsure if i was going to watch this and be disturbed for like a long  ...


## Aprendizaje supervisado: regresión logística para clasificar texto

In [None]:
# Split the shuffled reviews by position: the first n_train rows are used for
# training and the remaining rows for testing.
#
# Positional slicing is used on purpose. Label-based `df.loc[:35000]` includes
# both end points, so combined with `df.loc[35000:]` it placed row 35000 in
# the training set AND in the test set.
n_train = 35000
X_train = df['review'].values[:n_train]
y_train = df['sentiment'].values[:n_train]
X_test = df['review'].values[n_train:]
y_test = df['sentiment'].values[n_train:]

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Vectorizer that leaves the text as it is (no accent stripping, no
# lower-casing, no extra preprocessing); the reviews were already cleaned
# with `preprocessor`.
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

# Settings explored by both sub-grids.
shared_grid = {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]}

# First sub-grid: vectorizer defaults for idf and normalisation.
# Second sub-grid: same settings with idf weighting and normalisation
# switched off.
param_grid = [dict(shared_grid),
              dict(shared_grid,
                   vect__use_idf=[False],
                   vect__norm=[None])]

classifier = LogisticRegression(random_state=42, solver='liblinear')
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', classifier)])

gs_lr_tfidf = GridSearchCV(lr_tfidf,
                           param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=2,
                           n_jobs=1)

In [None]:
# Long-running cell: the grid defines 48 parameter combinations (2 sub-grids
# x 24 each) evaluated with cv=5, all on a single worker (n_jobs=1).
gs_lr_tfidf.fit(X_train,y_train)

In [None]:
#print(f'Mejores parámetros: {gs_lr_tfidf.best_params_}')

In [None]:
#print(f'CV accuracy: {gs_lr_tfidf.best_score_}')


In [None]:
#cls = gs_lr_tfidf.best_estimator_
#print(f'Test accuracy {cls.score(X_test, y_test)}')

In [None]:
# Final model with fixed hyperparameters. This rebinds `tfidf` and `lr_tfidf`,
# replacing the objects that were handed to the grid search.
# NOTE(review): C=10 / penalty='l2' / plain `tokenizer` / no stop words are
# presumably the best grid-search result, but the cells that print
# best_params_ are commented out -- confirm.
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None,
                        stop_words=None,
                        tokenizer=tokenizer,
                        ngram_range=(1,1))

lr_tfidf = Pipeline([('vect',tfidf),
                     ('clf', LogisticRegression(random_state=42,
                                                solver='liblinear',
                                                C=10,
                                                penalty='l2'
                                                ))])

In [None]:
# Fit the fixed-hyperparameter pipeline on the training split.
lr_tfidf.fit(X_train,y_train)



In [None]:
# Classification accuracy of the fitted pipeline on the test split.
lr_tfidf.score(X_test,y_test)

In [None]:
# The pipeline was trained on text cleaned by `preprocessor` (lower-cased,
# punctuation removed) and its vectorizer is configured with lowercase=False
# and preprocessor=None. Raw input therefore has to go through the same
# cleaning first; otherwise capitalised words such as "This" cannot match any
# term of the learned vocabulary.
lr_tfidf.predict(np.array([preprocessor("This is bullshit")]))

array([0])

In [None]:
# NOTE(review): leftover inspection cell (type of a single test sample);
# candidate for removal from the final notebook.
type(X_test[0])

In [None]:
# First ten test labels (1 = positive, 0 = negative, per the `labels` mapping).
y_test[:10]

In [None]:
import pickle

In [None]:
# Persist the fitted pipeline to disk.
# Caveats for whoever loads this file:
#  - the vectorizer holds a reference to the notebook-level `tokenizer`
#    function, so a function with that name must be defined in the loading
#    program before calling pickle.load;
#  - the pipeline does not clean its input (preprocessor=None,
#    lowercase=False), so text must be passed through `preprocessor` before
#    calling predict.
with open("model.pickle", "wb") as f:
  pickle.dump(lr_tfidf, f)