In [1]:
import os

import pandas as pd
import numpy as np

import nltk as nlp
from nltk import word_tokenize
from nltk import TweetTokenizer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator

In [2]:
# Tokenizer whose .tokenize method is later handed to the TF-IDF vectorizer.
# NOTE(review): reduce_len=True presumably shortens long runs of repeated
# characters (e.g. "soooo") -- confirm against the NLTK documentation.
tokenizer = TweetTokenizer(reduce_len=True)

In [3]:
# Location of the sentiment CSV. The original absolute path is kept as the
# default so existing runs behave the same; set SENTIMENT_DATA_CSV to point the
# notebook at a copy of the file on another machine without editing this cell.
DATA_CSV = os.environ.get(
    "SENTIMENT_DATA_CSV",
    "/home/ted/Desktop/Project 3/amazon_yelp_twitter2.csv",
)
df = pd.read_csv(DATA_CSV)

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,sentiment,text
0,1,nearly perfect wheat-free bread mix
1,0,be aware: speakers not as advertised on amazon
2,0,results comedic at best
3,1,going to enjoy the sunshine while its here
4,1,i feel better now.


In [5]:
### Clean the Data

In [6]:
# Keep only the two columns the rest of the notebook uses, selected by name
# rather than by position: a positional slice depends on the CSV's column order
# and would pick up a leading row-id column (dropping `text`) if one is present.
# Exact duplicate (sentiment, text) rows are then removed.
df = df[["sentiment", "text"]].drop_duplicates()

In [7]:
# Discard rows whose text is missing; the vectorizer needs an actual string.
df2 = df.loc[pd.notnull(df["text"])]

In [8]:
### Create a Model

In [9]:
# Hold out 30% of the cleaned rows for testing; the fixed seed keeps the split
# identical from run to run.
df_train, df_test = train_test_split(
    df2,
    random_state=42,
    test_size=0.3,
)

In [10]:
class ClfSwitcher(BaseEstimator):
    """Placeholder pipeline step that delegates everything to a wrapped classifier.

    Because the wrapped model is an ordinary constructor parameter named
    ``estimator``, a parameter grid can replace it (for example through a
    ``clf__estimator`` entry) and so compare different classifiers inside one
    pipeline.

    Parameters
    ----------
    estimator : estimator object or None, default=None
        Classifier that ``fit``, ``predict``, ``predict_proba`` and ``score``
        are forwarded to. When omitted, a new ``SGDClassifier()`` is created
        for this instance.
    """

    def __init__(self, estimator=None):
        # Build the default per instance. A default created in the signature is
        # evaluated only once, so every ClfSwitcher() would otherwise share (and
        # refit) the very same SGDClassifier object.
        self.estimator = SGDClassifier() if estimator is None else estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped estimator on ``X`` / ``y`` and return ``self``.

        Extra keyword arguments (fit parameters such as ``sample_weight``) are
        forwarded to the wrapped estimator's ``fit`` instead of being dropped.
        """
        self.estimator.fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped estimator's predictions for ``X`` (``y`` is ignored)."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's ``predict_proba(X)``.

        Fails if the wrapped estimator does not provide ``predict_proba``.
        """
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped estimator's own ``score(X, y)``."""
        return self.estimator.score(X, y)

In [11]:
# Two-step pipeline: TF-IDF features computed from the tweet tokenizer's
# tokens, followed by a classifier slot that the grid search fills in.
pipeline = Pipeline(steps=[
    ('vectorize', TfidfVectorizer(tokenizer=tokenizer.tokenize)),
    ('clf', ClfSwitcher()),
])

# One single-candidate grid per classifier, each left at its default settings.
parameters = [
    {'clf__estimator': [candidate]}
    for candidate in (MultinomialNB(), LogisticRegression(), SGDClassifier())
]

In [12]:
# Compare the candidate classifiers with 2-fold cross-validation on the
# training split, using 4 parallel workers and verbose progress output.
gscv = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=2,
    n_jobs=4,
    return_train_score=False,
    verbose=3,
)
gscv.fit(df_train["text"], df_train["sentiment"])

Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   6 | elapsed: 10.3min remaining: 20.6min
[Parallel(n_jobs=4)]: Done   6 out of   6 | elapsed: 17.9min finished


GridSearchCV(cv=2, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vectorize',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                     

In [13]:
# Display the raw cross-validation results: per-candidate fit/score times,
# mean test score and the estimator that was tried.
gscv.cv_results_

{'mean_fit_time': array([340.08255041, 471.05087054, 248.61485744]),
 'mean_score_time': array([264.3739419 , 271.09122169, 205.87077367]),
 'mean_test_score': array([0.77391349, 0.80119597, 0.77140628]),
 'param_clf__estimator': masked_array(data=[MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
                    LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=None, solver='warn', tol=0.0001, verbose=0,
                    warm_start=False),
                    SGDClassifier(alpha=0.0001, average=False, class_weight=None,
               early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
               l1_ratio=0.15, learning_rate='optimal', loss='hinge',
               max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
               power_t=0.5, rando