In [1]:
import numpy as np 
import pandas as pd

In [2]:
# Load the preprocessed tweets and discard the columns listed below, so that
# only the remaining columns are carried forward in tweetsDF.
# Each label is listed once, and the frame is built in a single expression
# instead of being mutated in place.
tweetsDF = pd.read_csv("preprocessed.csv").drop(
    labels=[
        "Unnamed: 0",
        "airline",
        "negativereason",
        "airline_sentiment_confidence",
        "airline_sentiment",
        "text",
    ],
    axis=1,
)

In [3]:
# Display the first two rows of tweetsDF as a quick sanity check.
tweetsDF.head(2)

Unnamed: 0,negativereason_confidence,sentiment,tweet2words,num_capitalized,tweet_length,num_negative_words,num_positive_words,num_neutral_words,has_capitalized,num_capitalised_positive_words,num_capitalised_negative_words,num_hashtags,num_special_character
0,13.160358,1,What said,0,3,0,0,4,1,0,0,0,3
1,0.0,1,plus added commercials experience tacky,0,6,0,0,9,0,0,0,0,4


In [4]:
# The text column is later fed to CountVectorizer, which needs every document
# to be a string. A plain cast would turn missing values into the literal text
# "nan" (which would then be counted as a word), so missing entries are
# replaced with an empty document before casting.
tweetsDF["tweet2words"] = tweetsDF["tweet2words"].fillna("").astype(str)

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Label column; every other column of tweetsDF is treated as a model input.
target = "sentiment"
features = [col for col in tweetsDF.columns if col != target]
# The inputs minus the free-text column.
numeric_features = [col for col in features if col != "tweet2words"]


In [7]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    tweetsDF[features],
    tweetsDF[target],
    test_size=0.33,
    random_state=42,
)

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

In [9]:
class TextSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts one text column from its input.

    The column is looked up with the bare key (``X[key]``), so for a pandas
    DataFrame and a single column label the result is that column as a Series,
    which is the shape text vectorizers expect.
    """

    def __init__(self, key):
        # Label of the column to hand on to the next pipeline step.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the entry of ``X`` stored under ``self.key``."""
        selected = X[self.key]
        return selected
 

In [10]:
class NumberSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts one numeric column from its input.

    The column is looked up with a one-element list (``X[[key]]``), so for a
    pandas DataFrame the result stays two-dimensional (a single-column frame).
    """

    def __init__(self, key):
        # Label of the column to hand on to the next pipeline step.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to the single column ``self.key``."""
        wanted = [self.key]
        return X[wanted]

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# Bag-of-words branch: pull out the cleaned tweet text, then count word occurrences.
text = Pipeline(steps=[
    ("selector", TextSelector(key="tweet2words")),
    ("countVec", CountVectorizer(analyzer="word")),
])

# Fit on the training rows and show the resulting document-term matrix.
text.fit_transform(X_train)

<9808x10227 sparse matrix of type '<class 'numpy.int64'>'
	with 87497 stored elements in Compressed Sparse Row format>

In [13]:
from sklearn.preprocessing import StandardScaler

# Numeric branch: select one column and standardise it.
# Note: despite the variable name, the column selected here is
# 'negativereason_confidence', not a length feature.
length = Pipeline(steps=[
    ("selector", NumberSelector(key="negativereason_confidence")),
    ("standard", StandardScaler()),
])

# Fit on the training rows and show the scaled column.
length.fit_transform(X_train)

array([[-0.00158819],
       [-0.17092916],
       [-0.17537903],
       ...,
       [-0.16641288],
       [-0.16641288],
       [-0.00158819]])

In [14]:
from sklearn.pipeline import FeatureUnion

In [15]:
# Combine the feature branches; only the bag-of-words branch is included here.
feats = FeatureUnion(transformer_list=[("text", text)])

# Wrap the union in a pipeline and show the training feature matrix it produces.
feature_processing = Pipeline(steps=[("feats", feats)])
feature_processing.fit_transform(X_train)

<9808x10227 sparse matrix of type '<class 'numpy.int64'>'
	with 87497 stored elements in Compressed Sparse Row format>

In [16]:
from sklearn.ensemble import RandomForestClassifier

# End-to-end model: feature union followed by a seeded 200-tree random forest.
pipeline = Pipeline(steps=[
    ("features", feats),
    ("classifier", RandomForestClassifier(n_estimators=200, random_state=42)),
])
pipeline.fit(X_train, Y_train)

# Accuracy on the held-out rows: fraction of predictions equal to the true label.
preds = pipeline.predict(X_test)
(preds == Y_test).mean()

0.8116721854304636

In [27]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [28]:
# Candidate classifiers to compare, in the order they will be evaluated.
clfs = [
    LogisticRegression(),
    SVC(),
    KNeighborsClassifier(n_neighbors=3),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
]

In [30]:
# Cross-validate the same feature pipeline with each candidate classifier and
# print the mean and standard deviation of every metric cross_validate returns.
for c in clfs:
    # Swap only the final estimator; the 'features' step is left unchanged.
    pipeline.set_params(classifier=c)
    # The defaults for `cv` and `return_train_score` differ between scikit-learn
    # releases, so both are spelled out. Train scores are requested because the
    # recorded output reports them. cv=3 is assumed to match the default of the
    # release that produced that output (judging by the estimator reprs) -
    # TODO confirm.
    scores = cross_validate(pipeline, X_train, Y_train, cv=3, return_train_score=True)
    print('---------------------------------')
    print(str(c))
    print('-----------------------------------')
    for key, values in scores.items():
        print(key,' mean ', values.mean())
        print(key,' std ', values.std())

---------------------------------
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
-----------------------------------
fit_time  mean  0.2993203004201253
fit_time  std  0.24355048146692487
score_time  mean  0.03563459714253744
score_time  std  0.0005173868599853422
test_score  mean  0.8068918506377988
test_score  std  0.0054046896093935416
train_score  mean  0.9445351173036453
train_score  std  0.0006120316421246042
---------------------------------
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
-----------------------------------
fit_time  mean  4.169328689575195
fit_time  std  0.01716559278210503
