In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [2]:
# Load the existing labelled corpus; the file is pipe-delimited.
df = pd.read_csv("../data/spanish_corpus.csv", delimiter='|')

In [3]:
# Preview the first five rows of the corpus.
df.head(n=5)

Unnamed: 0,opinion_rating,opinion_text
0,0,"Una película clásica en la enfermedad, muy ori..."
1,0,"El chambara, acá cine de samuráis, es un géner..."
2,0,De lo mejorcito que he visto de Shamalayan. Su...
3,0,"violenta, cruda, excelente direccion, actuacio..."
4,0,Una gran obra maestra de Zack Snyder: Muy buen...


In [4]:
# Class balance of the existing corpus.
df['opinion_rating'].value_counts()

0    12927
1     4281
Name: opinion_rating, dtype: int64

In [13]:
# Load the second review set (comma-separated, UTF-8 encoded).
df1 = pd.read_csv(filepath_or_buffer="../data/all.csv", encoding='utf-8')

In [17]:
# Preview the first five rows of the newly loaded reviews.
df1.head(n=5)

Unnamed: 0,sentiment,review
0,1.0,Hay veces en las que no me puedo callar. Esta ...
1,1.0,Demasiada falsedad. La pelicula presume de aut...
2,1.0,...de poner verde a Keira Knightley. Lo siento...
3,1.0,"El reparto es bueno, el argumento tambien, per..."
4,1.0,Sencillamente pésima. Se piensan que el cine a...


In [18]:
# Align the column names with the ones used by `df`.
df1 = df1.rename(columns=dict(sentiment='opinion_rating', review='opinion_text'))

In [20]:
# Confirm the columns were renamed.
df1.head(n=5)

Unnamed: 0,opinion_rating,opinion_text
0,1.0,Hay veces en las que no me puedo callar. Esta ...
1,1.0,Demasiada falsedad. La pelicula presume de aut...
2,1.0,...de poner verde a Keira Knightley. Lo siento...
3,1.0,"El reparto es bueno, el argumento tambien, per..."
4,1.0,Sencillamente pésima. Se piensan que el cine a...


In [22]:
# Turn the numeric ratings in df1 into binary labels, keeping only clearly
# polarised reviews: rating < 5 -> label 1, rating > 6 -> label 0.
# Ratings from 5 to 6 (and missing ratings, for which both comparisons are
# False) are dropped. Boolean masks replace the row-by-row iterrows() loop.
ratings = df1['opinion_rating']
is_low = ratings < 5
is_high = ratings > 6
polarised = is_low | is_high

# Columns are deliberately named 0 and 1 and the index restarts at 0, i.e. the
# same layout a frame built from a list of (label, text) tuples would have.
# Unlike that construction, this also yields both columns when no row
# qualifies, so the sort below cannot fail on a missing column.
df2 = pd.DataFrame(
    {
        0: is_low[polarised].astype('int64').values,
        1: df1.loc[polarised, 'opinion_text'].values,
    },
    columns=[0, 1],
)
df2 = df2.sort_values(by=0)


In [23]:
# Preview the relabelled reviews (sorted by label).
df2.head(n=5)

Unnamed: 0,0,1
12663,0,Me resistí durate años a ver esta serie porque...
16889,0,Cuando la ausencia de palabras no entra dentro...
16888,0,Sobretodo La Tribu es uno de los ejercicios fí...
16887,0,"The Tribe, de Miroslav Slaboshpitsky, es un dr..."
16886,0,"He escrito esta crítica sin leer ninguna otra,..."


In [25]:
# Give the positional columns 0 and 1 the same names as the columns of `df`.
df2 = df2.rename(columns=dict(zip((0, 1), ('opinion_rating', 'opinion_text'))))

In [26]:
# Stack both corpora into one frame. ignore_index=True gives the result a
# fresh 0..n-1 index; without it the row labels of `df` and `df2` overlap and
# label-based lookups on `merged` would return several rows per label.
merged = pd.concat([df, df2], ignore_index=True)

In [27]:
# Order the combined corpus by label (ascending).
merged = merged.sort_values(by='opinion_rating', ascending=True)

In [28]:
# Class balance after merging both corpora.
merged.opinion_rating.value_counts()

0    30096
1    12439
Name: opinion_rating, dtype: int64

In [29]:
# Model inputs (review text) and targets (binary label).
data = merged['opinion_text']
labels = merged['opinion_rating']

In [30]:
# Hold out a validation set, stratified on the labels.
# The call is wrapped in parentheses instead of backslash continuations; the
# previous version ended with a dangling `\` after the closing parenthesis,
# which silently glues any following line onto this statement.
# NOTE(review): VALIDATION_SPLIT and RANDOM_SEED are not assigned in any cell
# visible here — confirm a configuration cell defines them before this one
# runs, otherwise this raises NameError on a fresh kernel.
data_train, data_val, labels_train, labels_val = train_test_split(
    data,
    np.asarray(labels, dtype='int8'),
    test_size=VALIDATION_SPLIT,
    random_state=RANDOM_SEED,
    stratify=labels,
)

In [31]:
# Bag-of-n-grams (1- to 5-grams, min_df=4, capped at one million features)
# feeding a logistic-regression classifier.
pipeline_lr = Pipeline(steps=[
    ('vectorizer', CountVectorizer(max_features=1000000, min_df=4, ngram_range=(1, 5))),
    ('clf_lr', LogisticRegression(n_jobs=-1)),
])

In [32]:
# Fit the vectorizer and the classifier on the training split.
pipeline_lr.fit(X=data_train, y=labels_train)

Pipeline(steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000000, min_df=4,
        ngram_range=(1, 5), preprocessor=None, stop_words=None,
    ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [33]:
# Predict labels for the held-out validation texts.
predicted_labels = pipeline_lr.predict(X=data_val)

In [34]:
# F1 on the validation split (scored for label 1, the function's default positive class).
f1_score(y_true=labels_val, y_pred=predicted_labels)

0.83522012578616367

In [37]:
def to_csv(frame, file):
    """Write `frame` to `file` as a pipe-delimited CSV without the index.

    The header row is always written as ``opinion_rating|opinion_text``,
    replacing whatever column names `frame` carries, so `frame` is expected
    to hold exactly those two columns in that order.

    Parameters
    ----------
    frame : pandas.DataFrame
        Two-column frame to serialise.
    file : str or file-like
        Destination path or writable text buffer, passed to
        ``DataFrame.to_csv`` unchanged.
    """
    column_labels = ['opinion_rating', 'opinion_text']
    frame.to_csv(file, header=column_labels, index=False, sep='|')

In [38]:
# Persist the merged corpus for later use.
to_csv(frame=merged, file='../data/spanish_movies.csv')