# Construcción del modelo y persistencia del modelo 

In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.base import BaseEstimator, TransformerMixin

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer, WordNetLemmatizer

from preprocessor import Preprocessor

from joblib import dump

# Load the labelled movie reviews used for training and evaluation.
data = pd.read_csv('data/MovieReviews.csv', sep=',', encoding='utf-8')
# Keep an independent copy for the export step: that step overwrites
# data_t['review_es'], and a plain alias (data_t = data) would also rewrite
# the raw reviews held in `data`.
data_t = data.copy()

# Spanish stop words from NLTK.
# NOTE(review): not referenced again in this cell -- presumably consumed
# elsewhere; confirm before removing.
stop_words = set(stopwords.words('spanish'))


# TF-IDF features over NLTK word tokens, with no stop-word filtering here.
# NOTE(review): word_tokenize defaults to language='english'; the column name
# 'review_es' suggests Spanish text -- confirm whether that default is intended.
tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenize, stop_words=None)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['review_es'],
    data['sentimiento'],
    test_size=0.2,
    random_state=42,
)

preprocessor = Preprocessor()

# Pipeline: project preprocessor -> TF-IDF vectorizer -> linear SVM (C=1).
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('tfidf', tfidf_vectorizer),
    ('model', SVC(kernel='linear', C=1)),
])

# Train on the training split and predict the held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Report metrics treating the 'positivo' label as the positive class.
print('Precision: ', precision_score(y_test, y_pred, pos_label='positivo'))
print('Recall: ', recall_score(y_test, y_pred, pos_label='positivo'))
print('F1: ', f1_score(y_test, y_pred, pos_label='positivo'))

# Persist the whole fitted pipeline (preprocessing + vectorizer + model).
dump(pipeline, 'assets/modelo.joblib')

Precision:  0.8014705882352942
Recall:  0.872
F1:  0.8352490421455938


['assets/modelo.joblib']

In [3]:
# Replace the review text in data_t with the output of the pipeline's
# 'preprocessor' step, then write the frame to CSV without the index column.
preprocess_step = pipeline.named_steps['preprocessor']
data_t['review_es'] = preprocess_step.transform(data_t['review_es'])
data_t.to_csv('data/MovieReviews_preprocessed.csv', index=False)