In [10]:
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [5]:
# Read the pipe-delimited reviews file into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="../data/spanish_movies.csv",
    sep='|',
)

In [6]:
# Preview the first rows of the loaded frame (rating and review-text columns).
df.head()

Unnamed: 0,opinion_rating,opinion_text
0,0,"Una película clásica en la enfermedad, muy ori..."
1,0,Un científico obsesionado con resucitar cadáve...
2,0,Si Joaquin Phoenix no ganó el Oscar como mejor...
3,0,Es lo primero que se me ocurre tras el visiona...
4,0,Esta es posiblemente la crítica más especial y...


In [7]:
# Class balance check: number of rows per opinion_rating value.
df.opinion_rating.value_counts()

0    30096
1    12439
Name: opinion_rating, dtype: int64

In [35]:
# Character frequency table over the whole lower-cased corpus.
# Reads the text column straight from `df`: the `data` alias is only assigned
# in a later cell, so referencing it here fails on a fresh Restart & Run All.
symbols = Counter(" ".join(df.opinion_text).lower())

In [37]:
# (character, count) pairs ordered from most to least frequent.
# Computed directly from `symbols` so this cell does not rely on a variable
# that is only assigned further down the notebook.
symbols.most_common()

[(' ', 6934049),
 ('e', 4371669),
 ('a', 3821281),
 ('o', 2702937),
 ('s', 2383212),
 ('n', 2344930),
 ('r', 2043543),
 ('i', 1864459),
 ('l', 1798621),
 ('t', 1499785),
 ('d', 1480268),
 ('u', 1448860),
 ('c', 1402573),
 ('m', 981577),
 ('p', 920327),
 (',', 443727),
 ('b', 382988),
 ('g', 375792),
 ('q', 374319),
 ('y', 342879),
 ('.', 341679),
 ('h', 315426),
 ('v', 311855),
 ('f', 239165),
 ('í', 206896),
 ('j', 154030),
 ('ó', 148134),
 ('á', 142991),
 ('z', 97669),
 ('é', 81700),
 ('x', 51695),
 ('"', 48676),
 ('ñ', 44987),
 (')', 37227),
 ('(', 36573),
 ('ú', 33599),
 ('k', 31086),
 ('-', 23520),
 ('w', 22977),
 (':', 22840),
 ('0', 18811),
 ('1', 16287),
 ('!', 15042),
 ('?', 12924),
 ('/', 12177),
 ('“', 11370),
 ('”', 10981),
 ('2', 10797),
 ('¿', 9527),
 (';', 8978),
 ("'", 8356),
 ('9', 7627),
 ('3', 5706),
 ('5', 5306),
 ('¡', 4425),
 ('4', 4333),
 ('8', 4132),
 ('7', 3689),
 ('6', 3568),
 ('…', 3337),
 ('’', 1570),
 ('‘', 1349),
 ('*', 1117),
 ('ü', 1068),
 ('–', 865),
 (

In [38]:
# (character, count) pairs, most frequent first.
s = symbols.most_common()
# Keep only the characters that occur more than 1000 times in the corpus.
res = [symbol for symbol, count in s if count > 1000]
res

[' ',
 'e',
 'a',
 'o',
 's',
 'n',
 'r',
 'i',
 'l',
 't',
 'd',
 'u',
 'c',
 'm',
 'p',
 ',',
 'b',
 'g',
 'q',
 'y',
 '.',
 'h',
 'v',
 'f',
 'í',
 'j',
 'ó',
 'á',
 'z',
 'é',
 'x',
 '"',
 'ñ',
 ')',
 '(',
 'ú',
 'k',
 '-',
 'w',
 ':',
 '0',
 '1',
 '!',
 '?',
 '/',
 '“',
 '”',
 '2',
 '¿',
 ';',
 "'",
 '9',
 '3',
 '5',
 '¡',
 '4',
 '8',
 '7',
 '6',
 '…',
 '’',
 '‘',
 '*',
 'ü']

In [42]:
# 70th percentile of review length in characters.
# Uses the DataFrame column directly: `data` is only assigned in a later cell,
# so referencing it here breaks a top-to-bottom run on a fresh kernel.
np.percentile(df.opinion_text.str.len(), q=70)

1143.0

In [8]:
# Hold-out fraction for validation and the seed used for the split.
VALIDATION_SPLIT, RANDOM_SEED = 0.1, 42

# Model inputs (review texts) and targets (ratings) from the loaded frame.
data = df["opinion_text"]
labels = df["opinion_rating"]

In [9]:
# Stratified train/validation split: `stratify=labels` keeps the class ratio
# the same in both parts; labels are cast to int8 before splitting.
# The call is parenthesised instead of using backslash continuations; the
# original ended with a stray trailing backslash that only parsed because a
# blank line happened to follow it.
data_train, data_val, labels_train, labels_val = train_test_split(
    data,
    np.asarray(labels, dtype='int8'),
    test_size=VALIDATION_SPLIT,
    random_state=RANDOM_SEED,
    stratify=labels,
)

In [31]:
# Two-step model: character-level TF-IDF over 1- to 7-grams (vocabulary capped
# at 1,000,000 features) followed by a logistic-regression classifier.
pipeline_lr = Pipeline(steps=[
    (
        'vectorizer',
        TfidfVectorizer(analyzer='char', ngram_range=(1, 7), max_features=1000000),
    ),
    (
        'clf_lr',
        LogisticRegression(C=100, n_jobs=-1),
    ),
])

In [32]:
# Fit both pipeline steps (vectorizer, then classifier) on the training split.
pipeline_lr.fit(data_train, labels_train)

Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000000, min_df=1,
        ngram_range=(1, 7), norm='l2', preprocessor=None, smooth_idf...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [33]:
# Predict labels for the held-out validation texts with the fitted pipeline.
predicted_labels = pipeline_lr.predict(data_val)

In [30]:
# F1 score of the validation predictions against the true validation labels.
# NOTE(review): this cell's execution count (30) is lower than those of the
# fit/predict cells (32/33), so the displayed score may belong to an earlier
# model — re-run after Restart & Run All to confirm.
f1_score(labels_val, predicted_labels)

0.85177453027139871