In [166]:
SAVE_MODEL_TO_DISK = 0
LOAD_MODEL = 1

# GENERAL LIBS
import numpy as np 
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
%matplotlib inline

# SKLEARN
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB

#NLTK
from nltk.stem.snowball import SnowballStemmer

# GENSIM
from gensim.sklearn_api import W2VTransformer
from gensim.models import Word2Vec

import warnings
warnings.filterwarnings('ignore')

In [2]:
tweetsDF = pd.read_csv("preprocessed2.csv")

tweetsDF.drop(labels=["Unnamed: 0",
                      "airline", 
                      "negativereason", 
                      "airline_sentiment_confidence", 
                      "negativereason",
                      "negativereason_confidence",
                      "airline_sentiment",
                      "text"], axis=1, inplace=True)

In [3]:
def stemming(tokens):
    '''
    Apply stemming to each token
    
    @return:
        Return a list of stemmed tokens
    '''
    
    stemmer = SnowballStemmer("english")  
    stemmed = [stemmer.stem(w) for w in tokens.split()]
    return stemmed

In [4]:
tweetsDF["tweet2words"] = tweetsDF["tweet2words"].values.astype("U")
tweetsDF["correctedText"] = tweetsDF["correctedText"].values.astype("U")
tweetsDF.drop(labels=['tweet2words'], axis=1, inplace=True)

# tweetsDF["correctedText"] = tweetsDF["correctedText"].apply(stemming)

In [5]:
tweetsDF.head(2)

Unnamed: 0,sentiment,num_capitalized,tweet_length,num_negative_words,num_positive_words,num_neutral_words,has_capitalized,num_capitalised_positive_words,num_capitalised_negative_words,num_hashtags,num_special_character,correctedText
0,1,0,3,0,0,4,1,0,0,0,3,What said
1,1,0,6,0,0,9,0,0,0,0,4,plus added commercials experience tacky


In [6]:
# # sentences = Sentences(tweetsDF['correctedText'].tolist()) # a memory-friendly iterator
# tweetsList = tweetsDF['correctedText']
# tweetsList[:5]


In [7]:
# onlyWords = list()
# for w in tweetsList:
#     try:
#         onlyWords.append(list(w))
#     except AttributeError:
#         onlyWords.append([""])
# onlyWords[:5]

In [8]:
# model = W2VTransformer(size=10, min_count=1, seed=1)

# try: 
#     wordvecs = model.fit(onlyWords[:5000]).transform(onlyWords[5000:])
# except KeyError:
#     pass

# wordvecs = model.fit(onlyWords[:-1]).transform(onlyWords[:-1])

In [9]:
target = "sentiment"
features = [c for c in tweetsDF.columns.values if c not in [target]]
numeric_features =  [c for c in tweetsDF.columns.values if c not in ['tweet2words', 'correctedText', target]]

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(tweetsDF[features], tweetsDF[target], test_size=0.33, random_state=42)

In [11]:
X_train['correctedText'][5]

' seriously would pay flight seats playing really bad thing flying VA'

In [12]:
print("X_train: " + repr(X_train.shape))
print("X_test: " + repr(X_test.shape))
print("Y_train: " + repr(Y_train.shape))
print("Y_test: " + repr(Y_test.shape))

X_train: (9808, 11)
X_test: (4832, 11)
Y_train: (9808,)
Y_test: (4832,)


---
A criação dos pipelines foi feita utilizando os seguintes kernels no kaggle como auxiliadores;
1. [Building A Scikit Learn Classification Pipeline](https://www.kaggle.com/gautham11/building-a-scikit-learn-classification-pipeline)
2. [A Deep Dive Into Sklearn Pipelines](https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines)
---

In [13]:
class TextSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def splitString(self, s):
        try:
            return s.split()
        except AttributeError:
            return ""
            
    
    def transform(self, X):
        # Apply the word2vec transformation
        a = X[self.key]
#         return wordvecs.fit_transform(a)
        return a
 

In [14]:
# class Word2VecTransformation(BaseEstimator, TransformerMixin):
#     def __init__(self, key):
#         self.key = key

#     def fit(self, X, y=None):
#         return self
    
#     def transform(self, X):
#         print(type(X))

In [15]:
class Senteces(object):
    def __init__(self, dirname):
        self.dirname = dirname
 
    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname)):
                yield line.split()

In [16]:
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

In [17]:
text = Pipeline([
                ('selector', TextSelector(key='correctedText')),
                ('countVec', CountVectorizer(analyzer = "word"))
#                 ('w2v', Word2VecTransformation)
#                 ('w2v', Word2Vec())
            ])

# text.fit_transform(X_train)

In [18]:
# negConfidence = Pipeline([
#                     ('selector', NumberSelector(key='negativereason_confidence')),
#                     ('standard', StandardScaler())
#                 ])

numCapitalized = Pipeline([
                    ('selector', NumberSelector(key='num_capitalized')),
                    ('standard', StandardScaler())
                ])

tweetLength = Pipeline([
                    ('selector', NumberSelector(key='tweet_length')),
                    ('standard', StandardScaler())
                ])

numNegativeWords = Pipeline([
                    ('selector', NumberSelector(key='num_negative_words')),
                    ('standard', StandardScaler())
                ])

numPositiveWords = Pipeline([
                    ('selector', NumberSelector(key='num_positive_words')),
                    ('standard', StandardScaler())
                ])

numNeutralWords = Pipeline([
                    ('selector', NumberSelector(key='num_neutral_words')),
                    ('standard', StandardScaler())
                ])


numCapitalizedPositiveWords = Pipeline([
                    ('selector', NumberSelector(key='num_capitalised_positive_words')),
                    ('standard', StandardScaler())
                ])

numCapitalizedNegativeWords = Pipeline([
                    ('selector', NumberSelector(key='num_capitalised_negative_words')),
                    ('standard', StandardScaler())
                ])

numHashtags = Pipeline([
                    ('selector', NumberSelector(key='num_hashtags')),
                    ('standard', StandardScaler())
                ])


numSpecialCharacter = Pipeline([
                    ('selector', NumberSelector(key='num_special_character')),
                    ('standard', StandardScaler())
                ])

In [19]:
feats = FeatureUnion([('text', text),
                      ('numCapitalized', numCapitalized),
                      ("tweetLength", tweetLength),
                      ("numNegativeWords", numNegativeWords),
                      ("numPositiveWords", numPositiveWords),
                      ("numNeutralWords", numNeutralWords),
                      ("numCapitalizedPositiveWords", numCapitalizedPositiveWords),
                      ("numCapitalizedNegativeWords", numCapitalizedNegativeWords),
                      ("numHashtags", numHashtags),
                      ("numSpecialCharacter", numSpecialCharacter)
                     ])

feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(X_train)

<9808x8269 sparse matrix of type '<class 'numpy.float64'>'
	with 156963 stored elements in Compressed Sparse Row format>

In [20]:
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline([
    ('features',feats),
    ('classifier', RandomForestClassifier(n_estimators=200, random_state = 42)),
])

pipeline.fit(X_train, Y_train)

preds = pipeline.predict(X_test)
np.mean(preds == Y_test)

0.8222268211920529

In [48]:
if LOAD_MODEL == 0:
    clfs = list()
    # clfs.append(LogisticRegression())
    clfs.append(SVC())
    clfs.append(DecisionTreeClassifier())
    # clfs.append(RandomForestClassifier(n_estimators=200, random_state = 42))
    # clfs.append(GradientBoostingClassifier())
    clfs.append(MLPClassifier())
#     clfs.append(MultinomialNB())

    scores = list()
    pipelineList = list()
    scores2 = list()


    for c in clfs:
        pipeline.set_params(classifier = c)
        pipeline.fit(X_train, Y_train)
        s = cross_validate(pipeline, X_train, Y_train, 
                           scoring=["accuracy", "recall", "precision", "f1"], 
                           cv=3, return_estimator = True)
        scores.append(s)
        pipelineList.append(pipeline)
    #     scores2.append(pipeline.score(X_train, Y_train))
        print('---------------------------------')
        print(str(c))
        print('-----------------------------------')
#         for key, values in s.items():
#                 print(key,' mean ', values.mean())
#                 print(key,' std ', values.std())

---------------------------------
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
-----------------------------------
---------------------------------
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
-----------------------------------
---------------------------------
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=20

In [118]:
bestModels = list()
for model in scores:
    maxAcc = max(model['test_accuracy'])
    bestModelX = np.where(model['test_accuracy'] == maxAcc)
    bestModels.append(model['estimator'][0])

In [132]:
for i in range(len(bestModels)):
    name = bestModels[i].get_params('classifier')
    print(str(name['classifier']).split('(')[0])

SVC
DecisionTreeClassifier
MLPClassifier


In [133]:
# svm = scores[0]
# maxAcc = max(svm['test_accuracy'])
# clf = svm['estimator'][np.where(model['test_accuracy'] == maxAcc)]

In [170]:
import pickle

from sklearn.externals import joblib

if SAVE_MODEL_TO_DISK == 1:
#     for i in range(len(pipelineList)):
#         joblib.dump(pipelineList[i], "model[" +str(i)+ "].joblib") 
    for i in range(len(bestModels)):
        model = bestModels[i].get_params('classifier')
        file = str(model['classifier']).split('(')[0]
        joblib.dump(bestModels[i], file + ".joblib") 
elif LOAD_MODEL == 1:
    bestModels = list()
    bestModels.append(joblib.load('DecisionTreeClassifier.joblib'))


In [172]:
from os import listdir
from os.path import isfile, join

tweetsPath = '03_processed/'
onlyfiles = [f for f in listdir(tweetsPath) if isfile(join(tweetsPath, f))]
model = bestModels[0]

for file in onlyfiles:
    newTweets = pd.read_csv(tweetsPath + file)
    newTweets.drop(labels=['text', 'tweet2words'], axis = 1, inplace=True)
    newTweets['correctedText'] = newTweets["correctedText"].values.astype("U")
    newTweets['sentiment'] = model.predict(newTweets)
    newTweets.to_csv('04_output/dec_tree/'+ file.split('.')[0] + '_prediction.csv')

In [151]:
from sklearn.metrics import confusion_matrix

# bestModel = clf[0]['estimator'][0]
allMeasures = dict()
for i in range(len(bestModels)):
    model = bestModels[i]
    tn, fp, fn, tp = confusion_matrix(model.predict(X_test), Y_test).ravel()

    measures = dict()
    measures["acc"] = (tp + tn)/(tn + fp + fn + tp) * 100
    prec = tp/(tp + fp) * 100
    recall = tp/(tp + fn) * 100
    measures["prec"] = prec
    measures["recall_sens"] = recall
    measures["f1_score"] = (2 * prec * recall/(prec + recall))
    measures['miss_rate'] = (fp + fn) / float(tp + tn + fp + fn) * 100
    measures['spec'] = tn/float(tn + fp) * 100
    measures['fp_rate'] = fp/float(tn + fp) * 100
    allMeasures[i] = measures

In [156]:
# allMeasures
performance = pd.DataFrame.from_dict(data=allMeasures, orient='index')
performance.set_axis(labels=['SVC', 'DecisionTreeClassifier', 'MLPClassifier'], inplace=True)
performance

Unnamed: 0,acc,prec,recall_sens,f1_score,miss_rate,spec,fp_rate
SVC,68.067053,12.936463,91.129032,22.656642,31.932947,66.819372,33.180628
DecisionTreeClassifier,73.903146,66.742988,63.163597,64.90398,26.096854,80.542532,19.457468
MLPClassifier,77.607616,68.231254,69.342641,68.782458,22.392384,82.171539,17.828461
