The objective of this notebook is to create a sentiment analysis model that works directly in Spanish.  We will try different models and an ensemble of them to see which one works better. To train the models we will use data taken from the TASS webpage (http://tass.sepln.org/), it contains multiple datasets but we have combined all of them into one dataset.


In [1]:
import pandas as pd
import numpy as np

import re
import string
import pickle


from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer


import spacy
from es_lemmatizer import lemmatize

import gensim.parsing.preprocessing as gsp
from gensim import utils

from sklearn.feature_extraction.text import TfidfVectorizer


from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split


from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
path0 = "D:/sentiment_analysis/"
tweets_corpus = pd.read_csv(path0 + "TASS_5_full.csv", index_col=0)

In [3]:
tweets_corpus.head()

Unnamed: 0,text,polarity,polarty_val
0,@PauladeLasHeras No te libraras de ayudar me/n...,NEU,0
1,@marodriguezb Gracias MAR,P,1
2,"Off pensando en el regalito Sinde, la que se v...",N+,-2
3,Conozco a alguien q es adicto al drama! Ja ja ...,P+,2
4,Toca @crackoviadeTV3 . Grabación dl especial N...,P+,2


As it can be seen the dataset has 5 categories: very positive (P+), positive(P), neutral(NEU), negative(N) and very negative(N+). Since our intention is just to train a model with positive, negative and neutral, we will create three new columns. The first one contains very positive and positive(1), very negative and negative(-1), and neutral tweets(0). The second column tags with a one all negative and very negative tweets, while tags with 0 the rest. The third column does the same as the second one but the (very)positive tweets are the ones tagged with a 1.

In [4]:
def create_cat_columns(tweets_corpus, choices, name_res_column):
    conditions = [
        (tweets_corpus["polarity"]  == 'P+'),
        (tweets_corpus["polarity"]  == 'P'),
        (tweets_corpus["polarity"]  == 'NEU'),
        (tweets_corpus["polarity"]  == 'N'),
        (tweets_corpus["polarity"]  == 'N+')]
    tweets_corpus[name_res_column] = np.select(conditions, choices, default='NONE')
    tweets_corpus[name_res_column] = tweets_corpus[name_res_column].astype(int)
    return tweets_corpus


tweets_corpus = create_cat_columns(tweets_corpus=tweets_corpus,
                                   choices=[1, 1, 0, -1, -1],
                                   name_res_column= "polarity_3_val")

tweets_corpus = create_cat_columns(tweets_corpus=tweets_corpus,
                                   choices=[0, 0, 0, 1, 1],
                                   name_res_column= "polarity_neg")

tweets_corpus = create_cat_columns(tweets_corpus=tweets_corpus,
                                   choices=[1, 1, 0, 0, 0],
                                   name_res_column= "polarity_pos")

In [5]:
tweets_corpus.head()

Unnamed: 0,text,polarity,polarty_val,polarity_3_val,polarity_neg,polarity_pos
0,@PauladeLasHeras No te libraras de ayudar me/n...,NEU,0,0,0,0
1,@marodriguezb Gracias MAR,P,1,1,0,1
2,"Off pensando en el regalito Sinde, la que se v...",N+,-2,-1,1,0
3,Conozco a alguien q es adicto al drama! Ja ja ...,P+,2,1,0,1
4,Toca @crackoviadeTV3 . Grabación dl especial N...,P+,2,1,0,1


The next step is the preprocessing, we will clean the text column and lemmatize de words. It is very important to notice that the stopwords do not contain words like "si" and "no" ("yes" and "no"). These are words that are very important for sentiment.

In [6]:
def create_custom_stop_words(list_new_stopwords, remove_words):
    custom_stop_words = list(set(stopwords.words('spanish')))
    custom_stop_words.extend(list_new_stopwords)
    custom_stop_words = [word for word in custom_stop_words if word not in remove_words]
    return custom_stop_words

def lemmatizer(text):
    sent = []
    doc = nlp(text)
    for word in doc:
        sent.append(word.lemma_)
    return " ".join(sent)

def clean_text(s):
    filters = [gsp.strip_tags,
               gsp.strip_punctuation,
               gsp.strip_multiple_whitespaces,
               gsp.strip_numeric]
    s = re.sub(r'http\S+', '', s)
    s = s.lower()
    s = utils.to_unicode(s)
    s = utils.deaccent(s)
    for f in filters:
        s = f(s)
    return s

In [7]:
stopwords_nltk = set(stopwords.words('spanish'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
nlp = spacy.load("es_core_news_sm")
nlp.add_pipe(lemmatize, after="tagger")


list_new_stopwords = ["aun", "ser", "ver", "hoy", "ustedes", "aqui",
                              "vamos", "haber", "hacer", "tener", "ir",
                              "decir", "comer","asi", "pues"]
remove_words = ["no", "si", "sí"]

custom_stop_words = create_custom_stop_words(list_new_stopwords, remove_words)

tweets_corpus["content_clean"] =  tweets_corpus["text"].apply(lambda x: ' '.join([word for word in x.split() if word not in (custom_stop_words)]))
tweets_corpus["content_clean"] = tweets_corpus["content_clean"].apply(lambda x: clean_text(x))
tweets_corpus = tweets_corpus.dropna(subset=["content_clean"])
tweets_corpus["content_clean"] = tweets_corpus["content_clean"].replace('\s+', ' ', regex=True)
tweets_corpus["content_clean"] = tweets_corpus["content_clean"].replace("", np.nan)
tweets_corpus["content_clean"] = tweets_corpus["content_clean"].replace(" ", np.nan)
tweets_corpus = tweets_corpus.dropna(subset=["content_clean"])

tweets_corpus = tweets_corpus[tweets_corpus['content_clean'].apply(lambda x: len(x) > 3)]
tweets_corpus.dropna(subset=['content_clean'], inplace=True)
tweets_corpus.reset_index(drop=True, inplace=True)

tweets_corpus["lemma_clean_text"] = tweets_corpus["content_clean"].apply(lambda x: lemmatizer(x))

In [8]:
tweets_corpus.head()

Unnamed: 0,text,polarity,polarty_val,polarity_3_val,polarity_neg,polarity_pos,content_clean,lemma_clean_text
0,@PauladeLasHeras No te libraras de ayudar me/n...,NEU,0,0,0,0,pauladelasheras no libraras ayudar me nos bes...,pauladelasheras no libraras ayudar me nos be...
1,@marodriguezb Gracias MAR,P,1,1,0,1,marodriguezb gracias mar,marodriguezb gracia mar
2,"Off pensando en el regalito Sinde, la que se v...",N+,-2,-1,1,0,off pensando regalito sinde va sgae van corrup...,off pensar regalito sinde ir sgae ir corrupto ...
3,Conozco a alguien q es adicto al drama! Ja ja ...,P+,2,1,0,1,conozco alguien q adicto drama ja ja ja suena ...,conocer alguien q adicto drama ja ja ja sonar ...
4,Toca @crackoviadeTV3 . Grabación dl especial N...,P+,2,1,0,1,toca crackoviadetv grabacion dl especial navid...,toca crackoviadetv grabacion dl especial navid...


In [9]:
tweets_corpus["polarity"].value_counts()

P+     21888
N      12562
N+      5379
P       2534
NEU     1958
Name: polarity, dtype: int64

As it can be seen the dataset is unbalanced when it comes to neutral tweets. This will be our biggest challenge.

We will separate 5% of the total tweets so we can do a final metric checkup at the end.

In [10]:
#separate 5% of posts for final checkup
msk = np.random.rand(len(tweets_corpus)) < 0.9
test_corpus = tweets_corpus[~msk]
tweets_corpus = tweets_corpus[msk]
tweets_corpus.reset_index(drop=True, inplace = True)


The next step is to train the models, we will use 7 different algorithms with the intention to find which one is the best.


In [11]:
def get_feature_vector(train_fit, custom_stop_words):
    vector = TfidfVectorizer(sublinear_tf=True, max_df=.9, stop_words=custom_stop_words)
    vector.fit(train_fit)
    return vector

def create_train_test(tweets_corpus, col_name_label, col_name_text, custom_stop_words):
    # Same tf vector will be used for Testing sentiments on unseen trending data
    tf_vector = get_feature_vector(np.array(tweets_corpus.loc[:, col_name_text]).ravel(),
                                   custom_stop_words)
    X = tf_vector.transform(np.array(tweets_corpus.loc[:, col_name_text]).ravel())
    y = np.array(tweets_corpus.loc[:, col_name_label]).ravel()
    indices = list(tweets_corpus.index)
    X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(X, y, indices, test_size=0.2, random_state=125)
    return X_train, X_test, y_train, y_test, indices_train, indices_test, tf_vector

def train_model(model_selected, X_train, X_test, y_train, y_test):
    model = model_selected
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    print()
    print(confusion_matrix(y_test, y_predict))
    print()
    print(classification_report(y_test, y_predict))
    return model, y_predict

Our first attempt is to use the data which contains 3 labels.


In [12]:
X_train, X_test, y_train, y_test, indices_train, indices_test, tf_vector = create_train_test(tweets_corpus, "polarity_3_val", 'lemma_clean_text', custom_stop_words)

model_svc, y_predict_svc= train_model(SVC(kernel="linear", tol=1e-7, C=.9),
                                       X_train, X_test, y_train, y_test)

model_rbf, y_predict_rbf = train_model(SVC(kernel="rbf", tol=1e-7, C=.9),
                                       X_train, X_test, y_train, y_test)

model_nb, y_predict_nb = train_model(MultinomialNB(alpha= 0.5),
                                       X_train, X_test, y_train, y_test)

model_LR, y_predict_LR = train_model(LogisticRegression(solver='lbfgs'),
                                       X_train, X_test, y_train, y_test)

model_RF, y_predict_RF = train_model(RandomForestClassifier(max_depth= 14, max_features= 'sqrt',n_estimators= 10),
                                       X_train, X_test, y_train, y_test)

model_GB, y_predict_GB = train_model(GradientBoostingClassifier(max_depth= 14, n_estimators= 500),
                                       X_train, X_test, y_train, y_test)

model_XG, y_predict_XG = train_model(XGBClassifier(eta= 1, gamma= 1, reg_lambda = 5, 
                                                           max_depth= 12, n_estimators= 500),
                                       X_train, X_test, y_train, y_test)



[[2855    0  376]
 [ 210    0  131]
 [ 406    1 4030]]

              precision    recall  f1-score   support

          -1       0.82      0.88      0.85      3231
           0       0.00      0.00      0.00       341
           1       0.89      0.91      0.90      4437

    accuracy                           0.86      8009
   macro avg       0.57      0.60      0.58      8009
weighted avg       0.82      0.86      0.84      8009


[[2831    0  400]
 [ 215    0  126]
 [ 387    0 4050]]

              precision    recall  f1-score   support

          -1       0.82      0.88      0.85      3231
           0       0.00      0.00      0.00       341
           1       0.89      0.91      0.90      4437

    accuracy                           0.86      8009
   macro avg       0.57      0.60      0.58      8009
weighted avg       0.82      0.86      0.84      8009


[[2730    0  501]
 [ 213    0  128]
 [ 521    1 3915]]

              precision    recall  f1-score   support

          -1

  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



[[2818    2  411]
 [ 199    5  137]
 [ 370    2 4065]]

              precision    recall  f1-score   support

          -1       0.83      0.87      0.85      3231
           0       0.56      0.01      0.03       341
           1       0.88      0.92      0.90      4437

    accuracy                           0.86      8009
   macro avg       0.76      0.60      0.59      8009
weighted avg       0.85      0.86      0.84      8009


[[ 610    0 2621]
 [  33    0  308]
 [  69    0 4368]]

              precision    recall  f1-score   support

          -1       0.86      0.19      0.31      3231
           0       0.00      0.00      0.00       341
           1       0.60      0.98      0.74      4437

    accuracy                           0.62      8009
   macro avg       0.49      0.39      0.35      8009
weighted avg       0.68      0.62      0.54      8009



  _warn_prf(average, modifier, msg_start, len(result))



[[2674   18  539]
 [ 188   11  142]
 [ 424   14 3999]]

              precision    recall  f1-score   support

          -1       0.81      0.83      0.82      3231
           0       0.26      0.03      0.06       341
           1       0.85      0.90      0.88      4437

    accuracy                           0.83      8009
   macro avg       0.64      0.59      0.59      8009
weighted avg       0.81      0.83      0.82      8009


[[2714   25  492]
 [ 194   14  133]
 [ 432   14 3991]]

              precision    recall  f1-score   support

          -1       0.81      0.84      0.83      3231
           0       0.26      0.04      0.07       341
           1       0.86      0.90      0.88      4437

    accuracy                           0.84      8009
   macro avg       0.65      0.59      0.59      8009
weighted avg       0.82      0.84      0.82      8009



As it can be seen these models seem to be able to predict positive and negative tweets with an accuracy of around 80-90%, but they just can't seem to be able to detect neutral tweets.

To try to improve the models we will try a different approach, we will create models that detect positive tweets, and models that detect negative tweets. We will combine these models into a single one.

In [13]:
X_train, X_test, y_train, y_test, indices_train, indices_test, tf_vector = create_train_test(tweets_corpus, "polarity_neg", 'lemma_clean_text', custom_stop_words)

model_svc_neg, y_predict_svc_neg = train_model(SVC(kernel="linear", tol=1e-7, C=.9),
                                       X_train, X_test, y_train, y_test)

model_rbf_neg, y_predict_rbf_neg = train_model(SVC(kernel="rbf", tol=1e-7, C=.9),
                                       X_train, X_test, y_train, y_test)

model_nb_neg, y_predict_nb_neg = train_model(MultinomialNB(alpha= 0.5),
                                       X_train, X_test, y_train, y_test)

model_LR_neg, y_predict_LR_neg = train_model(LogisticRegression(solver='lbfgs'),
                                       X_train, X_test, y_train, y_test)

model_RF_neg, y_predict_RF_neg = train_model(RandomForestClassifier(max_depth= 14, max_features= 'sqrt',n_estimators= 10),
                                       X_train, X_test, y_train, y_test)

model_GB_neg, y_predict_GB_neg = train_model(GradientBoostingClassifier( 
                                                           max_depth= 14,n_estimators= 500),
                                       X_train, X_test, y_train, y_test)

model_XG_neg, y_predict_XG_neg = train_model(XGBClassifier(eta= 1, gamma= 1, reg_lambda = 5, 
                                                           max_depth= 12, n_estimators= 500),
                                       X_train, X_test, y_train, y_test)


X_train, X_test, y_train, y_test, indices_train, indices_test, tf_vector = create_train_test(tweets_corpus, "polarity_pos", 'lemma_clean_text', custom_stop_words)

model_svc_pos, y_predict_svc_pos = train_model(SVC(kernel="linear", tol=1e-7, C=.9),
                                       X_train, X_test, y_train, y_test)

model_rbf_pos, y_predict_rbf_pos = train_model(SVC(kernel="rbf", tol=1e-7, C=.9),
                                       X_train, X_test, y_train, y_test)

model_nb_pos, y_predict_nb_pos = train_model(MultinomialNB(alpha= 0.5),
                                       X_train, X_test, y_train, y_test)

model_LR_pos, y_predict_LR_pos = train_model(LogisticRegression(solver='lbfgs'),
                                       X_train, X_test, y_train, y_test)

model_RF_pos, y_predict_RF_pos = train_model(RandomForestClassifier(max_depth= 14, max_features= 'sqrt', n_estimators= 10),
                                       X_train, X_test, y_train, y_test)

model_GB_pos, y_predict_GB_pos = train_model(GradientBoostingClassifier(
                                                           max_depth= 14,n_estimators= 500),
                                       X_train, X_test, y_train, y_test)

model_XG_pos, y_predict_XG_pos = train_model(XGBClassifier(eta= 1, gamma= 1, reg_lambda = 5, 
                                                           max_depth= 12, n_estimators= 500),
                                       X_train, X_test, y_train, y_test)



[[4260  518]
 [ 473 2758]]

              precision    recall  f1-score   support

           0       0.90      0.89      0.90      4778
           1       0.84      0.85      0.85      3231

    accuracy                           0.88      8009
   macro avg       0.87      0.87      0.87      8009
weighted avg       0.88      0.88      0.88      8009


[[4285  493]
 [ 511 2720]]

              precision    recall  f1-score   support

           0       0.89      0.90      0.90      4778
           1       0.85      0.84      0.84      3231

    accuracy                           0.87      8009
   macro avg       0.87      0.87      0.87      8009
weighted avg       0.87      0.87      0.87      8009


[[4175  603]
 [ 640 2591]]

              precision    recall  f1-score   support

           0       0.87      0.87      0.87      4778
           1       0.81      0.80      0.81      3231

    accuracy                           0.84      8009
   macro avg       0.84      0.84      0.

In [14]:
col_name_text = "lemma_clean_text"
df = test_corpus

X_new = tf_vector.transform(np.array(df.loc[:, col_name_text]).ravel())
df_predicts = pd.DataFrame()

df_predicts["predict_svc_neg"] = model_svc_neg.predict(X_new)
df_predicts["predict_rbf_neg"] = model_rbf_neg.predict(X_new)
df_predicts["predict_nb_neg"] = model_nb_neg.predict(X_new)
df_predicts["predict_LR_neg"] = model_LR_neg.predict(X_new)
df_predicts["predict_RF_neg"] = model_RF_neg.predict(X_new)
df_predicts["predict_GB_neg"] = model_GB_neg.predict(X_new)
df_predicts["predict_XG_neg"] = model_XG_neg.predict(X_new)

df_predicts["predict_svc_pos"] = model_svc_pos.predict(X_new)
df_predicts["predict_rbf_pos"] = model_rbf_pos.predict(X_new)
df_predicts["predict_nb_pos"] = model_nb_pos.predict(X_new)
df_predicts["predict_LR_pos"] = model_LR_pos.predict(X_new)
df_predicts["predict_RF_pos"] = model_RF_pos.predict(X_new)
df_predicts["predict_GB_pos"] = model_GB_pos.predict(X_new)
df_predicts["predict_XG_pos"] = model_XG_pos.predict(X_new)

def check_model(name_model, df_predicts):
    pos_model = "predict_" + name_model +"_pos"
    neg_model = "predict_" + name_model +"_neg"

    conditions = [(df_predicts[pos_model]  == 0) & (df_predicts[neg_model]  == 0),
                      (df_predicts[pos_model]  == 1) & (df_predicts[neg_model]  == 1),
                (df_predicts[pos_model]  == 1),
                (df_predicts[neg_model]  == 1)]
    choices = ["neu", "neu", "pos", "neg"]
    test_corpus[name_model] = np.select(conditions, choices, default='neu')

    conditions = [
            (test_corpus["polarity_3_val"]  == -1),
            (test_corpus["polarity_3_val"]  == 1),
            (test_corpus["polarity_3_val"]  == 0)]
    choices = ["neg", "pos", "neu"]
    test_corpus["tag_sent"] = np.select(conditions, choices, default='NONE')

    print(classification_report(test_corpus["tag_sent"], test_corpus[name_model]))
    print(confusion_matrix(test_corpus["tag_sent"], test_corpus[name_model]))


print("svc results:")
check_model("svc",df_predicts)
print("rbf results:")
check_model("rbf",df_predicts)
print("nb results:")
check_model("nb",df_predicts)
print("LR results:")
check_model("LR",df_predicts)
print("RF results:")
check_model("RF",df_predicts)
print("XG results:")
check_model("XG",df_predicts)
print("GB results:")
check_model("GB",df_predicts)

svc results:
              precision    recall  f1-score   support

         neg       0.86      0.84      0.85      1759
         neu       0.14      0.20      0.17       187
         pos       0.91      0.89      0.90      2332

    accuracy                           0.84      4278
   macro avg       0.64      0.65      0.64      4278
weighted avg       0.85      0.84      0.85      4278

[[1484  122  153]
 [  86   38   63]
 [ 150  108 2074]]
rbf results:
              precision    recall  f1-score   support

         neg       0.86      0.84      0.85      1759
         neu       0.15      0.21      0.17       187
         pos       0.91      0.89      0.90      2332

    accuracy                           0.84      4278
   macro avg       0.64      0.65      0.64      4278
weighted avg       0.85      0.84      0.85      4278

[[1475  129  155]
 [  86   39   62]
 [ 153   94 2085]]
nb results:
              precision    recall  f1-score   support

         neg       0.82      0.79  

As it can be seen these combined models seem to work a little better than the multilabel ones. In particular we will choose the Linear SVC to create our full model.

First we will save all the models (and the tfidf vector) in case we will want to use them later.

In [15]:
#let´s save the models.
path_model = "D:/sentiment_analysis/models/"

with open(path_model + 'model_svc_neg', 'wb') as picklefile:
    pickle.dump(model_svc_neg, picklefile)

with open(path_model + 'model_rbf_neg', 'wb') as picklefile:
    pickle.dump(model_rbf_neg,picklefile)

with open(path_model + 'model_nb_neg', 'wb') as picklefile:
    pickle.dump(model_nb_neg,picklefile)

with open(path_model + 'model_LR_neg', 'wb') as picklefile:
    pickle.dump(model_LR_neg,picklefile)

with open(path_model + 'model_RF_neg', 'wb') as picklefile:
    pickle.dump(model_RF_neg,picklefile)
    
with open(path_model + 'model_GB_neg', 'wb') as picklefile:
    pickle.dump(model_GB_neg,picklefile)

with open(path_model + 'model_XG_neg', 'wb') as picklefile:
    pickle.dump(model_XG_neg,picklefile)
    
with open(path_model + 'model_svc_pos', 'wb') as picklefile:
    pickle.dump(model_svc_pos,picklefile)

with open(path_model + 'model_rbf_pos', 'wb') as picklefile:
    pickle.dump(model_rbf_pos,picklefile)

with open(path_model + 'model_nb_pos', 'wb') as picklefile:
    pickle.dump(model_nb_pos,picklefile)

with open(path_model + 'model_LR_pos', 'wb') as picklefile:
    pickle.dump(model_LR_pos,picklefile)

with open(path_model + 'model_RF_pos', 'wb') as picklefile:
    pickle.dump(model_RF_pos,picklefile)
    
with open(path_model + 'model_GB_pos', 'wb') as picklefile:
    pickle.dump(model_GB_pos,picklefile)

with open(path_model + 'model_XG_pos', 'wb') as picklefile:
    pickle.dump(model_XG_pos,picklefile)

with open(path_model + 'vectorizer.pk', 'wb') as fin:
     pickle.dump(tf_vector, fin)

Finally, we will combine all of the pieces of this model into one class that will be used to predict the sentiment of new texts.

In [16]:
class SentimentAnalysis:
    
    path_model = "D:/sentiment_analysis/models/"
    with open(path_model + 'model_svc_neg', 'rb') as training_model:
        model_svc_neg = pickle.load(training_model)

    with open(path_model + 'model_svc_pos', 'rb') as training_model:
        model_svc_pos = pickle.load(training_model)

    with open(path_model + 'vectorizer.pk', 'rb') as fin:
        tf_vector = pickle.load(fin)
    
    def create_custom_stop_words(self, list_new_stopwords, remove_words):
        custom_stop_words = list(set(stopwords.words('spanish')))
        custom_stop_words.extend(list_new_stopwords)
        custom_stop_words = [word for word in custom_stop_words if word not in remove_words]
        return custom_stop_words

    def lemmatizer(self, text,nlp):
        sent = []
        doc = nlp(text)
        for word in doc:
            sent.append(word.lemma_)
        return " ".join(sent)

    def clean_text(self, s):
        filters = [gsp.strip_tags,
                   gsp.strip_punctuation,
                   gsp.strip_multiple_whitespaces,
                   gsp.strip_numeric]
        s = re.sub(r'http\S+', '', s)
        s = s.lower()
        s = utils.to_unicode(s)
        s = utils.deaccent(s)
        for f in filters:
            s = f(s)
        return s
    
    def preprocess(self, text, list_new_stopwords, remove_words, nlp):
        custom_stop_words = self.create_custom_stop_words(list_new_stopwords, remove_words)
        text = ' '.join([word for word in text.split() if word not in (custom_stop_words)])
        text = self.clean_text(text)
        text = self.lemmatizer(text, nlp)
        return text
    
    def get_sentiment(self, text, model_neg= model_svc_neg, model_pos=model_svc_pos , vector= tf_vector):
        stopwords_nltk = set(stopwords.words('spanish'))
        exclude = set(string.punctuation)
        lemma = WordNetLemmatizer()
        nlp = spacy.load("es_core_news_sm")
        nlp.add_pipe(lemmatize, after="tagger")

        list_new_stopwords = ["aun", "ser", "ver", "hoy", "ustedes", "aqui",
                                      "vamos", "haber", "hacer", "tener", "ir",
                                      "decir", "comer","asi", "pues"]
        remove_words = ["no", "si", "sí"]
        text = self.preprocess(text, list_new_stopwords, remove_words, nlp)
        
        X_new = vector.transform([text])

        predict_svc_neg = model_neg.predict(X_new)
        predict_svc_pos = model_pos.predict(X_new)

        pred = ""
        if (predict_svc_neg[0]== 1) & (predict_svc_pos[0]== 1):
            pred = "mix"
        elif (predict_svc_neg[0]== 0) & (predict_svc_pos[0]== 0):
            pred = "neutral"
        elif (predict_svc_neg[0]== 1) & (predict_svc_pos[0]== 0):
            pred = "negative"
        elif (predict_svc_neg[0]== 0)  & (predict_svc_pos[0]== 1):
            pred = "positive"
        return pred

        

In [17]:
sentiment = SentimentAnalysis()
text = "odio este programa de television"
sentiment.get_sentiment(text)

'negative'

In [18]:
text = "Me encanta este programa de television"
sentiment.get_sentiment(text)


'positive'

In [19]:
text = "Mi modelo funciona"
sentiment.get_sentiment(text)


'positive'

In [20]:
text = "Mi no modelo funciona"
sentiment.get_sentiment(text)


'negative'