In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [2]:
# --- Run configuration -------------------------------------------------------
# Seed handed to train_test_split so the train/validation partition is repeatable.
RANDOM_SEED = 42
# Fraction of the samples held out as the validation set.
VALIDATION_SPLIT = 0.1

# Input file read by load_data(); each data line is parsed as "label|text".
TEXT_DATA_FILE = "../data/spanish_movies.csv"
# When true, load_data() discards the first line of the file as a header row.
HEADER = True

def load_data(path=None, header=None, encoding="utf-8"):
    """Read a pipe-delimited ``label|text`` file into two parallel lists.

    Each non-blank line is split on the first ``|`` only, so the text part may
    itself contain ``|`` characters. The text is lower-cased and the label is
    converted with ``int``.

    Parameters
    ----------
    path : str or path-like, optional
        File to read. Defaults to the module-level ``TEXT_DATA_FILE``.
    header : bool, optional
        Whether to skip the first line. Defaults to the module-level ``HEADER``.
    encoding : str, optional
        Text encoding used to decode the file.
        NOTE(review): UTF-8 is assumed for the corpus -- confirm against the
        actual data file and pass another encoding if needed.

    Returns
    -------
    (list of str, list of int)
        Lower-cased texts and their integer labels, in file order.

    Raises
    ------
    ValueError
        If a non-blank line has no ``|`` separator or its label is not an
        integer.
    """
    # Globals are resolved at call time so later edits to the config are honoured.
    if path is None:
        path = TEXT_DATA_FILE
    if header is None:
        header = HEADER

    x = []
    y = []
    # Explicit encoding: without it the platform default decides how accented
    # characters are decoded, which makes results machine-dependent.
    with open(path, "r", encoding=encoding) as f:
        if header:
            next(f, None)  # default avoids StopIteration on an empty file
        first_line_no = 2 if header else 1
        for line_no, line in enumerate(f, start=first_line_no):
            line = line.rstrip("\n")
            if not line.strip():
                # Blank lines (e.g. a trailing newline at EOF) carry no sample.
                continue
            temp_y, sep, temp_x = line.partition("|")
            if not sep:
                raise ValueError(
                    "line %d of %s has no '|' separator: %r" % (line_no, path, line)
                )
            x.append(temp_x.lower())
            y.append(int(temp_y))

    return x, y

# Load the corpus and pack the labels into an int8 NumPy array.
data, labels = load_data()
labels = np.asarray(labels, dtype=np.int8)

# Split the original data into training and validation sets. The split is
# seeded with RANDOM_SEED and stratified on the labels.
data_train, data_val, labels_train, labels_val = train_test_split(
    data,
    labels,  # already an int8 ndarray, so no further conversion is needed
    test_size=VALIDATION_SPLIT,
    random_state=RANDOM_SEED,
    stratify=labels,
)

In [3]:
# Character-level TF-IDF features (n-grams of 1 to 7 characters, vocabulary
# capped at 1,000,000 features) feeding a logistic-regression classifier.
pipeline_lr = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer='char', ngram_range=(1, 7), max_features=1000000)),
    # random_state reuses the seed already used for the train/validation split,
    # so any solver-side shuffling is repeatable between runs.
    ('clf_lr', LogisticRegression(C=100, n_jobs=-1, random_state=RANDOM_SEED)),
])

In [4]:
# Fit the vectorizer and the classifier on the training split.
pipeline_lr.fit(X=data_train, y=labels_train)

Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000000, min_df=1,
        ngram_range=(1, 7), norm='l2', preprocessor=None, smooth_idf...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [5]:
# Predict a label for every document in the validation split.
predicted_labels = pipeline_lr.predict(X=data_val)

In [6]:
# F1 score of the validation predictions against the true validation labels.
f1_score(y_true=labels_val, y_pred=predicted_labels)

0.85083333333333333

In [7]:
import pickle

In [8]:
# Persist the fitted pipeline. The context manager guarantees the file handle
# is flushed and closed, even if serialization raises part-way through.
with open("../models/spanish_linear_model.pkl", 'wb') as model_file:
    pickle.dump(pipeline_lr, model_file)