## Análise de sentimentos sobre linhas aéreas (Parte 2/4)

Este notebook faz parte de um conjunto de notebooks com o objetivo de realizar a tarefa de análise de sentimentos em tweets sobre linhas aéreas americanas. 

Neste segundo notebook, iremos treinar alguns modelos de machine learning. A base utilizada para treino se encontra [aqui](https://www.kaggle.com/crowdflower/twitter-airline-sentiment). 

In [1]:
# Variavel que indica se queremos salvar um modelo treinando no HD
SAVE_MODEL_TO_DISK = 0

# Variavel que indica se queremos carregar um modelo salvo no HD
LOAD_MODEL = 1

# GENERAL LIBS
import numpy as np 
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
%matplotlib inline

# SKLEARN
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.pipeline import FeatureUnion
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB

#NLTK
from nltk.stem.snowball import SnowballStemmer

# GENSIM
from gensim.sklearn_api import W2VTransformer
from gensim.models import Word2Vec

import warnings
warnings.filterwarnings('ignore')

In [2]:
tweetsDF = pd.read_csv("preprocessed2.csv")

tweetsDF.drop(labels=["Unnamed: 0",
                      "airline", 
                      "negativereason", 
                      "airline_sentiment_confidence", 
                      "negativereason",
                      "negativereason_confidence",
                      "airline_sentiment",
                      "text"], axis=1, inplace=True)

In [3]:
def stemming(tokens):
    '''
    Apply stemming to each token
    
    @return:
        Return a list of stemmed tokens
    '''
    
    stemmer = SnowballStemmer("english")  
    stemmed = [stemmer.stem(w) for w in tokens.split()]
    return stemmed

In [4]:
tweetsDF["tweet2words"] = tweetsDF["tweet2words"].values.astype("U")
tweetsDF["correctedText"] = tweetsDF["correctedText"].values.astype("U")
tweetsDF.drop(labels=['tweet2words'], axis=1, inplace=True)

# tweetsDF["correctedText"] = tweetsDF["correctedText"].apply(stemming)

In [5]:
tweetsDF.head(2)

Unnamed: 0,sentiment,num_capitalized,tweet_length,num_negative_words,num_positive_words,num_neutral_words,has_capitalized,num_capitalised_positive_words,num_capitalised_negative_words,num_hashtags,num_special_character,correctedText
0,1,0,3,0,0,4,1,0,0,0,3,What said
1,1,0,6,0,0,9,0,0,0,0,4,plus added commercials experience tacky


In [6]:
target = "sentiment"
features = [c for c in tweetsDF.columns.values if c not in [target]]
numeric_features =  [c for c in tweetsDF.columns.values if c not in ['tweet2words', 'correctedText', target]]

In [7]:
# X_train, X_test, Y_train, Y_test = train_test_split(tweetsDF[features], tweetsDF[target], test_size=0.33, random_state=42)
X_train = tweetsDF[features]
Y_train = tweetsDF[target]

In [8]:
X_train['correctedText'][5]

' seriously would pay flight seats playing really bad thing flying VA'

In [9]:
print("X_train: " + repr(X_train.shape))
# print("X_test: " + repr(X_test.shape))
print("Y_train: " + repr(Y_train.shape))
# print("Y_test: " + repr(Y_test.shape))

X_train: (14640, 11)
Y_train: (14640,)


---
A criação dos pipelines foi feita utilizando os seguintes kernels no kaggle como auxiliadores;
1. [Building A Scikit Learn Classification Pipeline](https://www.kaggle.com/gautham11/building-a-scikit-learn-classification-pipeline)
2. [A Deep Dive Into Sklearn Pipelines](https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines)
---

In [10]:
class TextSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def splitString(self, s):
        try:
            return s.split()
        except AttributeError:
            return ""
            
    
    def transform(self, X):
        # Apply the word2vec transformation
        a = X[self.key]
#         return wordvecs.fit_transform(a)
        return a
 

In [11]:
class Senteces(object):
    def __init__(self, dirname):
        self.dirname = dirname
 
    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname)):
                yield line.split()

In [12]:
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

In [13]:
text = Pipeline([
                ('selector', TextSelector(key='correctedText')),
                ('countVec', CountVectorizer(analyzer = "word"))
            ])

In [14]:
numCapitalized = Pipeline([
                    ('selector', NumberSelector(key='num_capitalized')),
#                     ('standard', StandardScaler())
                    ('standard', MinMaxScaler())
                ])

tweetLength = Pipeline([
                    ('selector', NumberSelector(key='tweet_length')),
#                     ('standard', StandardScaler())
                    ('standard', MinMaxScaler())
                ])

numNegativeWords = Pipeline([
                    ('selector', NumberSelector(key='num_negative_words')),
                    #('standard', StandardScaler())
                    ('standard', MinMaxScaler())
                ])

numPositiveWords = Pipeline([
                    ('selector', NumberSelector(key='num_positive_words')),
#                     ('standard', StandardScaler())
                    ('standard', MinMaxScaler())
                ])

numNeutralWords = Pipeline([
                    ('selector', NumberSelector(key='num_neutral_words')),
#                     ('standard', StandardScaler())
                    ('standard', MinMaxScaler())
                ])


numCapitalizedPositiveWords = Pipeline([
                    ('selector', NumberSelector(key='num_capitalised_positive_words')),
#                     ('standard', StandardScaler())
                    ('standard', MinMaxScaler())
                ])

numCapitalizedNegativeWords = Pipeline([
                    ('selector', NumberSelector(key='num_capitalised_negative_words')),
#                     ('standard', StandardScaler())
                    ('standard', MinMaxScaler())
                ])

numHashtags = Pipeline([
                    ('selector', NumberSelector(key='num_hashtags')),
#                     ('standard', StandardScaler())
                    ('standard', MinMaxScaler())
                ])


numSpecialCharacter = Pipeline([
                    ('selector', NumberSelector(key='num_special_character')),
#                     ('standard', StandardScaler())
                    ('standard', MinMaxScaler())
                ])

In [15]:
feats = FeatureUnion([('text', text),
                      ('numCapitalized', numCapitalized),
                      ("tweetLength", tweetLength),
                      ("numNegativeWords", numNegativeWords),
                      ("numPositiveWords", numPositiveWords),
                      ("numNeutralWords", numNeutralWords),
                      ("numCapitalizedPositiveWords", numCapitalizedPositiveWords),
                      ("numCapitalizedNegativeWords", numCapitalizedNegativeWords),
                      ("numHashtags", numHashtags),
                      ("numSpecialCharacter", numSpecialCharacter)
                     ])

feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(X_train)

<14640x9769 sparse matrix of type '<class 'numpy.float64'>'
	with 190942 stored elements in Compressed Sparse Row format>

In [16]:
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline([
    ('features',feats),
    ('classifier', RandomForestClassifier(n_estimators=200, random_state = 42)),
])

pipeline.fit(X_train, Y_train)

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('text', Pipeline(memory=None,
     steps=[('selector', TextSelector(key='correctedText')), ('countVec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8',...ators=200, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

In [17]:
if LOAD_MODEL == 0:
    clfs = list()
    
    clfs.append(SVC())
    clfs.append(DecisionTreeClassifier())
    clfs.append(MLPClassifier())
    clfs.append(MultinomialNB())

    scores = list()
    pipelineList = list()

    for c in clfs:
        pipeline.set_params(classifier = c)
        pipeline.fit(X_train, Y_train)
        s = cross_validate(pipeline, X_train, Y_train, 
                           scoring=["accuracy", "recall", "precision", "f1"], 
                           cv=10, return_estimator = True)
        scores.append(s)
        pipelineList.append(pipeline)
        
        print('---------------------------------')
        print(str(c))
        print('-----------------------------------')
#         for key, values in s.items():
#                 print(key,' mean ', values.mean())
#                 print(key,' std ', values.std())

In [18]:
bestModels = list()
for model in scores:
    maxAcc = max(model['test_accuracy'])
    bestModelX = np.where(model['test_accuracy'] == maxAcc)
    bestModels.append(model['estimator'][0])

NameError: name 'scores' is not defined

In [20]:
for i in range(len(bestModels)):
    name = bestModels[i].get_params('classifier')
    print(str(name['classifier']).split('(')[0])

SVC
DecisionTreeClassifier
MLPClassifier
MultinomialNB


In [133]:
# svm = scores[0]
# maxAcc = max(svm['test_accuracy'])
# clf = svm['estimator'][np.where(model['test_accuracy'] == maxAcc)]

In [20]:
import pickle
from os import listdir
from os.path import isfile, join

modelsPath = 'models/'
onlyfiles = [f for f in listdir(modelsPath) if isfile(join(modelsPath, f))]
onlyfiles

['SVC.joblib',
 'MLPClassifier.joblib',
 'DecisionTreeClassifier.joblib',
 'MultinomialNB.joblib']

In [21]:
from sklearn.externals import joblib

if SAVE_MODEL_TO_DISK == 1:
#     for i in range(len(pipelineList)):
#         joblib.dump(pipelineList[i], "model[" +str(i)+ "].joblib") 
    for i in range(len(bestModels)):
        model = bestModels[i].get_params('classifier')
        file = str(model['classifier']).split('(')[0]
        joblib.dump(bestModels[i], file + ".joblib") 
elif LOAD_MODEL == 1:
    bestModels = list()
    for file in onlyfiles:
        bestModels.append(joblib.load(modelsPath + file))

In [22]:
for model in bestModels:
    print(str(model.get_params('classifier')['classifier']).split('(')[0])

SVC
MLPClassifier
DecisionTreeClassifier
MultinomialNB


Agora que temos os melhores modelos treinados, vamos usá-los para classificar os tweets coletados

In [23]:
from os import listdir
from os.path import isfile, join

tweetsPath = '03_processed/'
onlyfiles = [f for f in listdir(tweetsPath) if isfile(join(tweetsPath, f))]
# model = bestModels[0]
for model in bestModels: 
    for file in onlyfiles:
        newTweets = pd.read_csv(tweetsPath + file)
        newTweets.drop(labels=['text', 'tweet2words'], axis = 1, inplace=True)
        newTweets['correctedText'] = newTweets["correctedText"].values.astype("U")
        newTweets['sentiment'] = model.predict(newTweets)
        classifierName = str(model.get_params('classifier')['classifier']).split('(')[0]
#         newTweets.to_csv('04_output/' + classifierName + '/' + file.split('.')[0] + '_prediction.csv')

In [25]:
X_train, X_test, Y_train, Y_test = train_test_split(tweetsDF[features], tweetsDF[target], test_size=0.33, random_state=42)

In [28]:
from sklearn.metrics import confusion_matrix

# bestModel = clf[0]['estimator'][0]
allMeasures = dict()
for i in range(len(bestModels)):
    model = bestModels[i]
    tn, fp, fn, tp = confusion_matrix(model.predict(X_train), Y_train).ravel()

    measures = dict()
    measures["acc"] = (tp + tn)/(tn + fp + fn + tp) * 100
    prec = tp/(tp + fp) * 100
    recall = tp/(tp + fn) * 100
    measures["prec"] = prec
    measures["recall_sens"] = recall
    measures["f1_score"] = (2 * prec * recall/(prec + recall))
    measures['miss_rate'] = (fp + fn) / float(tp + tn + fp + fn) * 100
    measures['spec'] = tn/float(tn + fp) * 100
    measures['fp_rate'] = fp/float(tn + fp) * 100
    allMeasures[i] = measures

In [29]:
performance = pd.DataFrame.from_dict(data=allMeasures, orient='index')
performance.set_axis(labels=['SVC', 'DecisionTreeClassifier', 'MLPClassifier', 'MultinomialNB'], inplace=True)
# performance.set_axis(labels=['NB'], inplace=True)
performance

Unnamed: 0,acc,prec,recall_sens,f1_score,miss_rate,spec,fp_rate
SVC,62.122757,0.0,,,37.877243,62.122757,37.877243
DecisionTreeClassifier,97.236949,95.989233,96.691974,96.339322,2.763051,97.565359,2.434641
MLPClassifier,97.410277,96.79677,96.381667,96.588772,2.589723,98.041797,1.958203
MultinomialNB,86.042007,74.266487,86.979823,80.121969,13.957993,85.593731,14.406269
