In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import sys
import html
from pprint import pprint
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC, SVC, NuSVC 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.feature_selection import SelectFromModel

In [4]:
#Load training data & test data

names = ('polarity', 'id', 'date', 'query', 'author', 'text')
data_train = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin1', names=names)
data_test = pd.read_csv('testdata.manual.2009.06.14.csv', names=names)


#Sample n from 160k tweets
data_train_sample = data_train.sample(50000)

#Split into X and y
text_train_all = data_train_sample['text']
target_train_all = data_train_sample['polarity'].values

#Split training csv into training and validation components

text_train_small, text_validation, target_train_small, target_validation = train_test_split(
    text_train_all, target_train_all, test_size=.5, random_state=42)

In [5]:
#Data cleaning cell

contractions = {
"ain't": "are not","aren't": "are not","can't": "cannot","can't've": "cannot have","'cause": "because",
"could've": "could have","couldn't": "could not","couldn't've": "could not have","didn't": "did not",
"doesn't": "does not","don't": "do not","hadn't": "had not","hadn't've": "had not have","hasn't": "has not",
"haven't": "have not","he'd": "he would","he'd've": "he would have","he'll": "he will","he'll've": "he will have",
"he's": "he is","how'd": "how did","how'd'y": "how do you","how'll": "how will","how's": "how is",
"i'd": "I would","i'd've": "I would have","i'll": "I will","i'll've": "I will have","i'm": "I am",
"i've": "I have","isn't": "is not","it'd": "it had","it'd've": "it would have","it'll": "it will",
"it'll've": "it will have","it's": "it is","let's": "let us","ma'am": "madam","mayn't": "may not",
"might've": "might have","mightn't": "might not","mightn't've": "might not have","must've": "must have",
"mustn't": "must not","mustn't've": "must not have","needn't": "need not","needn't've": "need not have",
"o'clock": "of the clock","oughtn't": "ought not","oughtn't've": "ought not have","shan't": "shall not",
"sha'n't": "shall not","shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
"she'll": "she will","she'll've": "she will have","she's": "she is","should've": "should have","shouldn't": "should not",
"shouldn't've": "should not have","so've": "so have","so's": "so is","that'd": "that would",
"that'd've": "that would have","that's": "that is","there'd": "there would","there'd've": "there would have",
"there's": "there is","they'd": "they would","they'd've": "they would have","they'll": "they will",
"they'll've": "they will have","they're": "they are","they've": "they have","to've": "to have","wasn't": "was not",
"we'd": "we would","we'd've": "we would have","we'll": "we will","we'll've": "we will have","we're": "we are",
"we've": "we have","weren't": "were not","what'll": "what will","what'll've": "what will have",
"what're": "what are","what's": "what is","what've": "what have","when's": "when is","when've": "when have",
"where'd": "where did","where's": "where is","where've": "where have","who'll": "who will","who'll've": "who will have",
"who's": "who is","who've": "who have","why's": "why is","why've": "why have","will've": "will have",
"won't": "will not","won't've": "will not have","would've": "would have","wouldn't": "would not",
"wouldn't've": "would not have","y'all": "you all","y'all'd": "you all would","y'all'd've": "you all would have",
"y'all're": "you all are","y'all've": "you all have","you'd": "you would","you'd've": "you would have",
"you'll": "you will","you'll've": "you will have","you're": "you are","you've": "you have"
}

#remove contractions
def contraction_remove(line):
    for word in line.split():
        if word.lower() in contractions:
            line = line.replace(word, contractions[word.lower()])
    return line

#general cleaning
def tweet_cleaner(text):
    #Remove &quot; or &amp;
    souped = html.unescape(text)
    #Remove @mentions
    souped = re.sub(r'@\w+','',souped)
    #Remove http / https links
    souped = re.sub(r'https?://\S*','',souped)
    #Remove all remaining numbers / non-letters
    souped = re.sub("[^a-zA-Z]",' ',souped)
    #All lower case
    souped = souped.lower()
    return(souped)


#remove stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def stopword_remove(line):
    words = [w.strip() for w in line.split() if not w.strip() in stop_words]
    words = str.join(' ',words)
    return words

text_train_small = text_train_small.apply(lambda x: contraction_remove(x))
text_train_small = text_train_small.apply(lambda x: tweet_cleaner(x))
text_train_small = text_train_small.apply(lambda x: stopword_remove(x))

In [6]:
#Found logistic regression to be marginally better than SVM ('hinge') using grid search below.
#Best models found MultinomialNB and Logistic Regression (Maximum Entropy)
#max_df: ignore terms that have a document frequency strictly higher 
#than the given threshold.
#ngram_range: generate unigrams and bigrams
#use_idf=False: No inverse document frequency weighting, effectively just normalises

pipeline = Pipeline((
    ('vec', CountVectorizer(max_df=0.5,ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=False)),
    #('feature_selection', SelectFromModel(LinearSVC())),
    #('clf', SGDClassifier(alpha=1e-4, penalty='l2', l1_ratio=0.3, loss='log')),
    ('clf', MultinomialNB())
))

%time pipeline.fit(text_train_small, target_train_small).score(text_validation, target_validation)

CPU times: user 2.8 s, sys: 86.5 ms, total: 2.89 s
Wall time: 2.84 s


0.73916

In [7]:
#Grid Search

parameters = {
    'clf__alpha': (0.25, 0.5, 0.75, 1),
    'clf__fit_prior': (True, False),
    }

if __name__ == "__main__":
    # Multiprocessing requires the fork to happen in a __main__ protected
    # block. n_jobs=-1 invokes multiprocessing

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, cv=5,
                               n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    %time grid_search.fit(text_train_small, target_train_small)
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vec', 'tfidf', 'clf']
parameters:
{'clf__alpha': (0.25, 0.5, 0.75, 1), 'clf__fit_prior': (True, False)}
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 




AttributeError: 'GridSearchCV' object has no attribute 'best_score_'