In [2]:
import numpy as np 
import pandas as pd

In [3]:
# Load the preprocessed tweets and discard the columns that are not used
# downstream. The drop is chained onto the read (no ``inplace=True``) so the
# frame is produced in a single assignment, and each column is listed once
# (the original listed "negativereason" twice).
tweetsDF = pd.read_csv("preprocessed.csv").drop(
    columns=[
        "Unnamed: 0",
        "airline",
        "negativereason",
        "airline_sentiment_confidence",
        "airline_sentiment",
        "text",
    ]
)

In [4]:
# Preview the first two rows to check which columns remain in the frame.
tweetsDF.head(2)

Unnamed: 0,negativereason_confidence,sentiment,tweet2words,num_capitalized,tweet_length,num_negative_words,num_positive_words,num_neutral_words,has_capitalized,num_capitalised_positive_words,num_capitalised_negative_words,num_hashtags,num_special_character
0,13.160358,1,What said,0,3,0,0,4,1,0,0,0,3
1,0.0,1,plus added commercials experience tacky,0,6,0,0,9,0,0,0,0,4


In [5]:
# Make sure every entry of the text column is a string before vectorizing.
# Missing values are replaced by an empty string first: casting NaN directly
# to unicode yields the literal text "nan", which the vectorizer would then
# count as a real word.
tweetsDF["tweet2words"] = tweetsDF["tweet2words"].fillna("").astype(str)

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Name of the label column.
target = "sentiment"

# Every column except the label is a candidate input feature.
features = [col for col in tweetsDF.columns.values if col != target]

# The candidate features minus the free-text column.
numeric_features = [col for col in features if col != 'tweet2words']


In [8]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    tweetsDF[features],
    tweetsDF[target],
    test_size=0.33,
    random_state=42,
)

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin

---
A criação dos pipelines foi feita com o auxílio dos seguintes kernels do Kaggle:
1. [Building A Scikit Learn Classification Pipeline](https://www.kaggle.com/gautham11/building-a-scikit-learn-classification-pipeline)
2. [A Deep Dive Into Sklearn Pipelines](https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines)
---

In [10]:
class TextSelector(BaseEstimator, TransformerMixin):
    """
    Stateless pipeline step that picks one column out of its input.

    The column is looked up with single-bracket indexing (``X[key]``), which
    for a pandas DataFrame gives back a one-dimensional Series. Intended for
    text columns.

    Parameters
    ----------
    key : label of the column to extract.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; the instance is returned as-is."""
        return self

    def transform(self, X):
        """Return the entry of ``X`` stored under ``self.key``."""
        column_label = self.key
        return X[column_label]
 

In [11]:
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Stateless pipeline step that picks one column out of its input.

    The column is looked up with a one-element list (``X[[key]]``), which for
    a pandas DataFrame gives back a two-dimensional, single-column frame.
    Intended for numeric columns.

    Parameters
    ----------
    key : label of the column to extract.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; the instance is returned as-is."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the one-element list ``[self.key]``."""
        wanted_columns = [self.key]
        return X[wanted_columns]

In [69]:
# Display the column labels currently present in tweetsDF.
tweetsDF.columns

Index(['negativereason_confidence', 'sentiment', 'tweet2words',
       'num_capitalized', 'tweet_length', 'num_negative_words',
       'num_positive_words', 'num_neutral_words', 'has_capitalized',
       'num_capitalised_positive_words', 'num_capitalised_negative_words',
       'num_hashtags', 'num_special_character'],
      dtype='object')

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
# Bag-of-words branch: pull out the cleaned tweet text, then turn it into
# word-count features.
text = Pipeline(steps=[
    ('selector', TextSelector(key='tweet2words')),
    ('countVec', CountVectorizer(analyzer="word")),
])

# Fit on the training split and display the resulting matrix summary.
text.fit_transform(X_train)

<9808x10227 sparse matrix of type '<class 'numpy.int64'>'
	with 87497 stored elements in Compressed Sparse Row format>

In [74]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# negConfidence = Pipeline([
#                     ('selector', NumberSelector(key='negativereason_confidence')),
#                     ('standard', StandardScaler())
#                 ])

def _scaled_numeric_pipeline(column):
    """Build a pipeline that selects one numeric column and standardizes it.

    Parameters
    ----------
    column : str
        Name of the column handed to ``NumberSelector``.

    Returns
    -------
    Pipeline
        Unfitted two-step pipeline: ``'selector'`` followed by ``'standard'``.
    """
    return Pipeline([
        ('selector', NumberSelector(key=column)),
        ('standard', StandardScaler())
    ])


# One selector + scaler pipeline per numeric feature. All of them share the
# same shape, so they are built through the helper above instead of being
# copy-pasted.
numCapitalized = _scaled_numeric_pipeline('num_capitalized')
tweetLength = _scaled_numeric_pipeline('tweet_length')
numNegativeWords = _scaled_numeric_pipeline('num_negative_words')
numPositiveWords = _scaled_numeric_pipeline('num_positive_words')
numNeutralWords = _scaled_numeric_pipeline('num_neutral_words')
numCapitalizedPositiveWords = _scaled_numeric_pipeline('num_capitalised_positive_words')
numCapitalizedNegativeWords = _scaled_numeric_pipeline('num_capitalised_negative_words')
numHashtags = _scaled_numeric_pipeline('num_hashtags')
numSpecialCharacter = _scaled_numeric_pipeline('num_special_character')

In [73]:
from sklearn.pipeline import FeatureUnion

In [75]:
# Combine the bag-of-words branch with every scaled numeric branch; the
# union concatenates their outputs column-wise.
feats = FeatureUnion(transformer_list=[
    ('text', text),
    ('numCapitalized', numCapitalized),
    ("tweetLength", tweetLength),
    ("numNegativeWords", numNegativeWords),
    ("numPositiveWords", numPositiveWords),
    ("numNeutralWords", numNeutralWords),
    ("numCapitalizedPositiveWords", numCapitalizedPositiveWords),
    ("numCapitalizedNegativeWords", numCapitalizedNegativeWords),
    ("numHashtags", numHashtags),
    ("numSpecialCharacter", numSpecialCharacter),
])

# Wrap the union in a one-step pipeline, fit it on the training split and
# display the resulting matrix summary.
feature_processing = Pipeline(steps=[('feats', feats)])
feature_processing.fit_transform(X_train)

<9808x10236 sparse matrix of type '<class 'numpy.float64'>'
	with 156153 stored elements in Compressed Sparse Row format>

In [76]:
# from sklearn.ensemble import RandomForestClassifier

# pipeline = Pipeline([
#     ('features',feats),
#     ('classifier', RandomForestClassifier(n_estimators=200, random_state = 42)),
# ])

# pipeline.fit(X_train, Y_train)

# preds = pipeline.predict(X_test)
# np.mean(preds == Y_test)

In [77]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [78]:
# Candidate classifiers, evaluated one after another in the next cell.
clfs = [
    LogisticRegression(),
    SVC(),
    KNeighborsClassifier(n_neighbors=3),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=200, random_state=42),
    GradientBoostingClassifier(),
    MLPClassifier(),
]

# Accumulators filled by the evaluation loop.
scores = []        # cross-validation result dict per classifier
pipelineList = []  # pipeline object per classifier
scores2 = []       # score on the training split per classifier

In [79]:
from sklearn.base import clone

for c in clfs:
    # Build an independent pipeline for every classifier. The original cell
    # called set_params on a `pipeline` object that is only defined in a
    # commented-out cell (NameError on a fresh kernel) and appended that same
    # object on every iteration, so pipelineList held N references to the
    # last classifier. Cloning `feats` gives each pipeline its own unfitted
    # feature transformers.
    pipeline = Pipeline([
        ('features', clone(feats)),
        ('classifier', c),
    ])
    pipelineList.append(pipeline)

    # Cross-validate on the training split. return_train_score=True keeps the
    # train_* entries in the result on scikit-learn versions where they are
    # no longer returned by default.
    s = cross_validate(pipeline, X_train, Y_train,
                       scoring=["accuracy", "recall", "precision", "f1"],
                       return_train_score=True)
    scores.append(s)

    # Fit on the full training split so the stored pipeline can be reused,
    # and record its score on that same split.
    pipeline.fit(X_train, Y_train)
    scores2.append(pipeline.score(X_train, Y_train))

    print('---------------------------------')
    print(str(c))
    print('-----------------------------------')
    for key, values in s.items():
        print(key,' mean ', values.mean())
        print(key,' std ', values.std())

---------------------------------
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
-----------------------------------
fit_time  mean  0.14806437492370605
fit_time  std  0.00904830257954355
score_time  mean  0.15582974751790366
score_time  std  0.000709279240053915
test_accuracy  mean  0.8119900314603967
test_accuracy  std  0.0035970302002531753
train_accuracy  mean  0.9449939420352488
train_accuracy  std  0.0010735654242785579
test_recall  mean  0.7440092088787361
test_recall  std  0.004413044720313696
train_recall  mean  0.9277257608990325
train_recall  std  0.0021510150326381878
test_precision  mean  0.7559041442220042
test_precision  std  0.007729003242404524
train_precision  mean  0.9271005124880194
train_precision  std  0.00078253041101434
test_f1  mean  0.74987602479

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


---------------------------------
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
-----------------------------------
fit_time  mean  6.12412444750468
fit_time  std  1.6273857448174296
score_time  mean  9.541389226913452
score_time  std  0.7998875095835369
test_accuracy  mean  0.6212275822455969
test_accuracy  std  8.956554920299112e-05
train_accuracy  mean  0.6212275725602616
train_accuracy  std  4.478962316774625e-05
test_recall  mean  0.0
test_recall  std  0.0
train_recall  mean  0.0
train_recall  std  0.0
test_precision  mean  0.0
test_precision  std  0.0
train_precision  mean  0.0
train_precision  std  0.0
test_f1  mean  0.0
test_f1  std  0.0
train_f1  mean  0.0
train_f1  std  0.0
---------------------------------
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_par

In [33]:
from sklearn.metrics import roc_curve, roc_auc_score

In [86]:
import pickle

In [88]:
# Serialize the first stored pipeline to an in-memory bytes object.
# Note: this rebinds `s`, which the evaluation loop also uses for its
# cross-validation results.
s = pickle.dumps(pipelineList[0])

In [89]:
# `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn; prefer the standalone package and fall back to the bundled
# copy only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib


In [92]:
# Persist every stored pipeline to its own file, named after its position
# in pipelineList.
for position, stored_pipeline in enumerate(pipelineList):
    joblib.dump(stored_pipeline, "pipeline[{}].joblib".format(position))