In [8]:
import re

import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from transformer import *
from classify import *
from analyze import *


In [30]:
#Evaluate unigram models using Naive Bayes, Logistic Regression, and SVM classifiers

def unigrams(X, y):
    """Evaluate unigram bag-of-words models and summarise the best one.

    Naive Bayes, logistic regression and linear SVM pipelines are each tuned
    with a 5-fold stratified grid search (accuracy scoring) over the full
    input. The pipeline with the highest mean cross-validation accuracy is
    then refit on a stratified train split and scored on the held-out part.

    Parameters
    ----------
    X : array-like of str
        Tweets; each one is passed through ``remove_urls`` before vectorizing.
    y : array-like
        Class label for each tweet.

    Returns
    -------
    tuple
        ``(best_cv_accuracy, features, matrix)`` where ``features`` is a list
        of up to 20 ``(F-value, unigram)`` tuples sorted from the highest to
        the lowest F-value, and ``matrix`` is the confusion matrix of the
        refit pipeline on the held-out split.
    """
    uni_vect = CountVectorizer(encoding='utf-8', stop_words='english', ngram_range=(1, 1),
                               decode_error='ignore', strip_accents='ascii')
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)

    # Apply the remove_urls function to each tweet in the dataset
    X = np.vectorize(remove_urls)(X)

    # Estimators and the parameter grid to search for each of them.
    # solver='liblinear' is pinned because the grid includes the 'l1' penalty,
    # which the default solver of newer scikit-learn releases does not support.
    grid_values = np.arange(0.25, 2.0, 0.25)
    model_list = [
        (('clf', MultinomialNB()),
         {'clf__alpha': grid_values}),
        (('clf', LogisticRegression(solver='liblinear', random_state=15)),
         {'clf__penalty': ['l1', 'l2'],
          'clf__C': grid_values}),
        (('clf', LinearSVC(random_state=15)),
         {'clf__C': grid_values}),
    ]
    unigram_results = list()

    # Vectorize the dataset into unigrams and run a 5-fold stratified grid
    # search per model, recording (mean test score, fitted search).
    for clf_step, param_grid in model_list:
        uni_pipe = Pipeline(steps=[('vect', uni_vect), clf_step])
        uni_model = GridSearchCV(uni_pipe, param_grid=param_grid, scoring='accuracy',
                                 cv=skf, n_jobs=10)
        uni_model.fit(X, y)
        unigram_results.append((uni_model.best_score_, uni_model))

    # Rank on the score only: sorting the raw tuples would fall back to
    # comparing GridSearchCV objects (TypeError) whenever two scores tie.
    unigram_results.sort(key=lambda result: result[0], reverse=True)
    best_score, best_search = unigram_results[0]

    # Refit just the winning pipeline (with its tuned parameters) on the new
    # training split instead of re-running the whole grid search.
    best_uni_model = best_search.best_estimator_
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=15)
    best_uni_model.fit(X_train, y_train)
    uni_pred = best_uni_model.predict(X_test)

    # ANOVA F-value of every unigram on the training split.
    train_counts = uni_vect.fit_transform(X_train)
    f_values, _ = f_classif(train_counts, y_train)
    # NaN scores (constant columns) would otherwise be ranked as the largest.
    f_values = np.where(np.isnan(f_values), -np.inf, f_values)

    # Keep the 20 largest F-values, or all of them for a smaller vocabulary.
    n_top = min(20, f_values.shape[0])
    top_idx = np.argpartition(f_values, -n_top)[-n_top:]

    # get_feature_names() was removed from newer scikit-learn releases;
    # prefer get_feature_names_out() when the installed version has it.
    name_getter = getattr(uni_vect, 'get_feature_names_out', None) or uni_vect.get_feature_names
    names = np.asarray(name_getter())

    features = [(float(f_values[i]), str(names[i])) for i in top_idx]
    features.sort(reverse=True)

    # Best mean CV accuracy, top features with their F-values, confusion matrix
    return (best_score, features, confusion_matrix(y_test, uni_pred))


In [32]:
#Evaluate bigram models using Naive Bayes, Logistic Regression, and SVM classifiers

def bigrams(X, y):
    """Evaluate bigram bag-of-words models and summarise the best one.

    Naive Bayes, logistic regression and linear SVM pipelines are each tuned
    with a 5-fold stratified grid search (accuracy scoring) over the full
    input. The pipeline with the highest mean cross-validation accuracy is
    then refit on a stratified train split and scored on the held-out part.

    Parameters
    ----------
    X : array-like of str
        Tweets; each one is passed through ``remove_urls`` before vectorizing.
    y : array-like
        Class label for each tweet.

    Returns
    -------
    tuple
        ``(best_cv_accuracy, features, matrix)`` where ``features`` is a list
        of up to 20 ``(F-value, bigram)`` tuples sorted from the highest to
        the lowest F-value, and ``matrix`` is the confusion matrix of the
        refit pipeline on the held-out split.
    """
    bi_vect = CountVectorizer(encoding='utf-8', stop_words='english', ngram_range=(2, 2),
                              decode_error='ignore', strip_accents='ascii')
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)

    # Apply the remove_urls function to each tweet in the dataset
    X = np.vectorize(remove_urls)(X)

    # Estimators and the parameter grid to search for each of them.
    # solver='liblinear' is pinned because the grid includes the 'l1' penalty,
    # which the default solver of newer scikit-learn releases does not support.
    grid_values = np.arange(0.25, 2.0, 0.25)
    model_list = [
        (('clf', MultinomialNB()),
         {'clf__alpha': grid_values}),
        (('clf', LogisticRegression(solver='liblinear', random_state=15)),
         {'clf__penalty': ['l1', 'l2'],
          'clf__C': grid_values}),
        (('clf', LinearSVC(random_state=15)),
         {'clf__C': grid_values}),
    ]
    bigram_results = list()

    # Vectorize the dataset into bigrams and run a 5-fold stratified grid
    # search per model, recording (mean test score, fitted search).
    for clf_step, param_grid in model_list:
        bi_pipe = Pipeline(steps=[('vect', bi_vect), clf_step])
        bi_model = GridSearchCV(bi_pipe, param_grid=param_grid, scoring='accuracy',
                                cv=skf, n_jobs=10)
        bi_model.fit(X, y)
        bigram_results.append((bi_model.best_score_, bi_model))

    # Rank on the score only: sorting the raw tuples would fall back to
    # comparing GridSearchCV objects (TypeError) whenever two scores tie.
    bigram_results.sort(key=lambda result: result[0], reverse=True)
    best_score, best_search = bigram_results[0]

    # Refit just the winning pipeline (with its tuned parameters) on the new
    # training split instead of re-running the whole grid search.
    best_bi_model = best_search.best_estimator_
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=15)
    best_bi_model.fit(X_train, y_train)
    bi_pred = best_bi_model.predict(X_test)

    # ANOVA F-value of every bigram on the training split.
    train_counts = bi_vect.fit_transform(X_train)
    f_values, _ = f_classif(train_counts, y_train)
    # NaN scores (constant columns) would otherwise be ranked as the largest.
    f_values = np.where(np.isnan(f_values), -np.inf, f_values)

    # Keep the 20 largest F-values, or all of them for a smaller vocabulary.
    n_top = min(20, f_values.shape[0])
    top_idx = np.argpartition(f_values, -n_top)[-n_top:]

    # get_feature_names() was removed from newer scikit-learn releases;
    # prefer get_feature_names_out() when the installed version has it.
    name_getter = getattr(bi_vect, 'get_feature_names_out', None) or bi_vect.get_feature_names
    names = np.asarray(name_getter())

    features = [(float(f_values[i]), str(names[i])) for i in top_idx]
    features.sort(reverse=True)

    # Best mean CV accuracy, top features with their F-values, confusion matrix
    return (best_score, features, confusion_matrix(y_test, bi_pred))


In [34]:
#Evaluate trigram models using Naive Bayes, Logistic Regression, and SVM classifiers

def trigrams(X, y):
    """Evaluate trigram bag-of-words models and summarise the best one.

    Naive Bayes, logistic regression and linear SVM pipelines are each tuned
    with a 5-fold stratified grid search (accuracy scoring) over the full
    input. The pipeline with the highest mean cross-validation accuracy is
    then refit on a stratified train split and scored on the held-out part.

    Parameters
    ----------
    X : array-like of str
        Tweets; each one is passed through ``remove_urls`` before vectorizing.
    y : array-like
        Class label for each tweet.

    Returns
    -------
    tuple
        ``(best_cv_accuracy, features, matrix)`` where ``features`` is a list
        of up to 20 ``(F-value, trigram)`` tuples sorted from the highest to
        the lowest F-value, and ``matrix`` is the confusion matrix of the
        refit pipeline on the held-out split.
    """
    tri_vect = CountVectorizer(encoding='utf-8', stop_words='english', ngram_range=(3, 3),
                               decode_error='ignore', strip_accents='ascii')
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)

    # Apply the remove_urls function to each tweet in the dataset
    X = np.vectorize(remove_urls)(X)

    # Estimators and the parameter grid to search for each of them.
    # solver='liblinear' is pinned because the grid includes the 'l1' penalty,
    # which the default solver of newer scikit-learn releases does not support.
    grid_values = np.arange(0.25, 2.0, 0.25)
    model_list = [
        (('clf', MultinomialNB()),
         {'clf__alpha': grid_values}),
        (('clf', LogisticRegression(solver='liblinear', random_state=15)),
         {'clf__penalty': ['l1', 'l2'],
          'clf__C': grid_values}),
        (('clf', LinearSVC(random_state=15)),
         {'clf__C': grid_values}),
    ]
    trigram_results = list()

    # Vectorize the dataset into trigrams and run a 5-fold stratified grid
    # search per model, recording (mean test score, fitted search).
    for clf_step, param_grid in model_list:
        tri_pipe = Pipeline(steps=[('vect', tri_vect), clf_step])
        tri_model = GridSearchCV(tri_pipe, param_grid=param_grid, scoring='accuracy',
                                 cv=skf, n_jobs=10)
        tri_model.fit(X, y)
        trigram_results.append((tri_model.best_score_, tri_model))

    # Rank on the score only: sorting the raw tuples would fall back to
    # comparing GridSearchCV objects (TypeError) whenever two scores tie.
    trigram_results.sort(key=lambda result: result[0], reverse=True)
    best_score, best_search = trigram_results[0]

    # Refit just the winning pipeline (with its tuned parameters) on the new
    # training split instead of re-running the whole grid search.
    best_tri_model = best_search.best_estimator_
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=15)
    best_tri_model.fit(X_train, y_train)
    tri_pred = best_tri_model.predict(X_test)

    # ANOVA F-value of every trigram on the training split.
    train_counts = tri_vect.fit_transform(X_train)
    f_values, _ = f_classif(train_counts, y_train)
    # NaN scores (constant columns) would otherwise be ranked as the largest.
    f_values = np.where(np.isnan(f_values), -np.inf, f_values)

    # Keep the 20 largest F-values, or all of them for a smaller vocabulary.
    n_top = min(20, f_values.shape[0])
    top_idx = np.argpartition(f_values, -n_top)[-n_top:]

    # get_feature_names() was removed from newer scikit-learn releases;
    # prefer get_feature_names_out() when the installed version has it.
    name_getter = getattr(tri_vect, 'get_feature_names_out', None) or tri_vect.get_feature_names
    names = np.asarray(name_getter())

    features = [(float(f_values[i]), str(names[i])) for i in top_idx]
    features.sort(reverse=True)

    # Best mean CV accuracy, top features with their F-values, confusion matrix
    return (best_score, features, confusion_matrix(y_test, tri_pred))


In [9]:
# Load the four arrays used by the evaluation cells below.
# NOTE(review): data_load is not defined in the visible cells (presumably it comes from
# one of the star imports); the meaning of the two file arguments and the order of the
# returned values are taken from the variable names only - confirm against its definition.
X_train, X_test, y_train, y_test = data_load('train_newline.txt', 'dev_newline.txt')


In [31]:
# Run the unigram evaluation on the loaded training data and unpack its three results.
uni_accuracy, uni_features, uni_matrix = unigrams(X_train, y_train)

# Best mean cross-validation accuracy among the models searched by unigrams().
print('Best unigram model accuracy: ', uni_accuracy)
print('Top 20 unigram features: \n')

# Each entry is an (F-value, feature name) tuple, already sorted high-to-low by unigrams().
for feature in uni_features:
    print(feature)
    
# Confusion matrix computed on the hold-out split that unigrams() creates internally.
print('Unigram model confusion matrix: \n', uni_matrix)


Best unigram model accuracy:  0.711275
Top 20 unigram features: 

(270.80972172840893, 'tcot')
(116.16442477018541, 'uniteblue')
(108.67366319808505, 'p2')
(83.867941299774074, 'dailykos')
(80.539265072797747, 'voteblue')
(77.203170675642397, 'republican')
(50.593496673136855, 'roc')
(49.822341989974547, 'alpolitics')
(49.33974062122131, 'teaparty')
(46.710506721754761, 'sayfie')
(46.201135432064966, 'dandc')
(44.345668087407731, 'pjnet')
(43.940708250175369, 'gapol')
(40.137570950793211, 'obama')
(38.244388776805863, 'utpol')
(34.017490902352684, 'ccot')
(31.872043105193164, 'equality')
(31.586163986676247, 'gop2012')
(30.733573967456909, 'crnc')
(30.691082840554412, 'jjauthor')
Unigram model confusion matrix: 
 [[3439 1446]
 [1479 3636]]


In [33]:
# Run the bigram evaluation on the loaded training data and unpack its three results.
bi_accuracy, bi_features, bi_matrix = bigrams(X_train, y_train)

# Best mean cross-validation accuracy among the models searched by bigrams().
print('Best bigram model accuracy: ', bi_accuracy)
print('Top 20 bigram features: \n')

# Each entry is an (F-value, feature name) tuple, already sorted high-to-low by bigrams().
for feature in bi_features:
    print(feature)
    
# Confusion matrix computed on the hold-out split that bigrams() creates internally.
print('Bigram model confusion matrix: \n', bi_matrix)


Best bigram model accuracy:  0.6426
Top 20 bigram features: 

(67.724711443326996, 'http dailykos')
(50.340675529443885, 'http tcot')
(32.545446096233405, 'sayfie http')
(26.216555715034289, 'http dandc')
(23.698454048968582, 'scott brown')
(23.660902802759356, 'tcot tlot')
(22.000230237364836, 'rt foxnews')
(21.109527304664933, 'rt gop')
(20.966079621025784, 'parikh daily')
(20.966079621025784, 'gaurav parikh')
(20.378959988369964, 'http stories')
(20.187511405332319, 'p2 uniteblue')
(20.08454490634993, 'county republican')
(19.126889758983726, 'tlot tcot')
(18.866893510042352, 'ritnews roc')
(18.588615311321057, 'gop http')
(18.173859134170382, 'rt thedemocrats')
(18.099539371888721, 'voteblue http')
(18.099539371888721, 'http politicususa')
(17.831526141171217, 'http http')
Bigram model confusion matrix: 
 [[2367 2518]
 [1108 4007]]


In [35]:
# Run the trigram evaluation on the loaded training data and unpack its three results.
tri_accuracy, tri_features, tri_matrix = trigrams(X_train, y_train)

# Best mean cross-validation accuracy among the models searched by trigrams().
print('Best trigram model accuracy: ', tri_accuracy)
print('Top 20 trigram features: \n')

# Each entry is an (F-value, feature name) tuple, already sorted high-to-low by trigrams().
for feature in tri_features:
    print(feature)
    
# Confusion matrix computed on the hold-out split that trigrams() creates internally.
print('Trigram model confusion matrix: \n', tri_matrix)


Best trigram model accuracy:  0.5603
Top 20 trigram features: 

(20.966079621025784, 'parikh daily http')
(20.966079621025784, 'gaurav parikh daily')
(17.211954378180863, 'http twurl nl')
(15.297518719554629, 'http gop2012 tcot')
(14.670241697400172, 'tennessee democrat http')
(14.670241697400172, 'inbox constantcontact http')
(14.670241697400172, 'http ritnews roc')
(14.670241697400172, 'democrat http stories')
(14.340488224933132, 'tru town films')
(14.340488224933132, 'gop2012 tcot tlot')
(14.340488224933132, 'bunker coyotered9 http')
(14.282232704066855, 'just posted photo')
(13.621436991509031, 'democratic underground http')
(13.621436991509031, 'ctl p2 uniteblue')
(11.524257270968477, 'tlot ctl p2')
(11.470145786590141, 'county republican party')
(10.513614574229878, 'tcot tlot gop')
(10.513614574229878, 'north carolina investigating')
(10.513614574229878, 'investigating kay hagan')
(10.475882197642132, 'weekend li blue')
Trigram model confusion matrix: 
 [[ 712 4173]
 [ 270 4845]]

In [4]:
#Find the best model parameters using Naive Bayes, Logistic Regression, and SVM models

def find_best_model(X, y):
    """Tune n-gram + style-feature models and summarise the best one.

    Each classifier (Naive Bayes, logistic regression, linear SVM) is placed
    behind a ``FeatureUnion`` of a ``CountVectorizer`` and a
    ``FeatureExtractor`` and tuned with a 5-fold stratified grid search
    (accuracy scoring) over its own parameters and six n-gram ranges. The
    pipeline with the highest mean cross-validation accuracy is refit on a
    stratified train split and scored on the held-out part.

    Parameters
    ----------
    X : array-like of str
        Tweets; each one is passed through ``remove_urls`` before vectorizing.
    y : array-like
        Class label for each tweet.

    Returns
    -------
    tuple
        ``(best_cv_accuracy, features, matrix)`` where ``features`` is a list
        of up to 20 ``(F-value, n-gram)`` tuples sorted from the highest to
        the lowest F-value, computed with the n-gram range selected for the
        best pipeline, and ``matrix`` is the confusion matrix of the refit
        pipeline on the held-out split.
    """
    n_vect = CountVectorizer(encoding='utf-8', stop_words='english', decode_error='ignore',
                             strip_accents='ascii')
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)

    # Apply the remove_urls function to each tweet in the dataset
    X = np.vectorize(remove_urls)(X)

    # Combine the vectorized text with the additional features produced by FeatureExtractor
    union = FeatureUnion([('n_vect', n_vect), ('style_features', FeatureExtractor())])

    # Estimators and the parameter grid to search for each of them; the same
    # n-gram combinations are searched for every classifier.
    # solver='liblinear' is pinned because the grid includes the 'l1' penalty,
    # which the default solver of newer scikit-learn releases does not support.
    grid_values = np.arange(0.25, 2.0, 0.25)
    ngram_ranges = [(1, 1), (2, 2), (3, 3), (1, 2), (1, 3), (2, 3)]
    model_list = [
        (('clf', MultinomialNB()),
         {'clf__alpha': grid_values,
          'union__n_vect__ngram_range': ngram_ranges}),
        (('clf', LogisticRegression(solver='liblinear', random_state=15)),
         {'clf__penalty': ['l1', 'l2'],
          'clf__C': grid_values,
          'union__n_vect__ngram_range': ngram_ranges}),
        (('clf', LinearSVC(random_state=15)),
         {'clf__C': grid_values,
          'union__n_vect__ngram_range': ngram_ranges}),
    ]

    ngram_results = list()

    # Run a 5-fold stratified grid search per model, recording
    # (mean test score, fitted search).
    for clf_step, param_grid in model_list:
        n_pipe = Pipeline(steps=[('union', union), clf_step])
        n_model = GridSearchCV(n_pipe, param_grid=param_grid, scoring='accuracy', cv=skf,
                               verbose=1, n_jobs=10)
        n_model.fit(X, y)
        ngram_results.append((n_model.best_score_, n_model))

    # Rank on the score only: sorting the raw tuples would fall back to
    # comparing GridSearchCV objects (TypeError) whenever two scores tie.
    ngram_results.sort(key=lambda result: result[0], reverse=True)
    best_score, best_search = ngram_results[0]

    # Refit the winning pipeline on a fresh training split and predict the rest.
    best_model = best_search.best_estimator_
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=15)
    best_model.fit(X_train, y_train)
    best_pred = best_model.predict(X_test)

    # Score features with the vectorizer fitted inside the winning pipeline,
    # so the ranking uses the selected n-gram range rather than n_vect's
    # default (1, 1) range.
    fitted_union = best_model.named_steps['union']
    fitted_vect = dict(fitted_union.transformer_list)['n_vect']
    train_counts = fitted_vect.transform(X_train)
    f_values, _ = f_classif(train_counts, y_train)
    # NaN scores (constant columns) would otherwise be ranked as the largest.
    f_values = np.where(np.isnan(f_values), -np.inf, f_values)

    # Keep the 20 largest F-values, or all of them for a smaller vocabulary.
    n_top = min(20, f_values.shape[0])
    top_idx = np.argpartition(f_values, -n_top)[-n_top:]

    # get_feature_names() was removed from newer scikit-learn releases;
    # prefer get_feature_names_out() when the installed version has it.
    name_getter = (getattr(fitted_vect, 'get_feature_names_out', None)
                   or fitted_vect.get_feature_names)
    names = np.asarray(name_getter())

    features = [(float(f_values[i]), str(names[i])) for i in top_idx]
    features.sort(reverse=True)

    # Best mean CV accuracy, top features with their F-values, confusion matrix
    return (best_score, features, confusion_matrix(y_test, best_pred))


In [6]:
# Run the combined n-gram / extra-feature search on the loaded training data
# and unpack its three results.
best_accuracy, best_features, best_matrix = find_best_model(X_train, y_train)

# Best mean cross-validation accuracy among the models searched by find_best_model().
print('Best n-gram model accuracy: ', best_accuracy)
print('Top 20 n-gram features: \n')

# Each entry is an (F-value, feature name) tuple, already sorted high-to-low.
for feature in best_features:
    print(feature)
    
# Confusion matrix computed on the hold-out split that find_best_model() creates internally.
print('Best n-gram model confusion matrix: \n', best_matrix)


Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   30.0s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  2.7min
[Parallel(n_jobs=10)]: Done 210 out of 210 | elapsed:  3.1min finished


Fitting 5 folds for each of 84 candidates, totalling 420 fits


[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   34.3s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  5.8min
[Parallel(n_jobs=10)]: Done 420 out of 420 | elapsed: 17.2min finished


Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  1.3min
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  7.8min
[Parallel(n_jobs=10)]: Done 210 out of 210 | elapsed:  9.0min finished


Best n-gram model accuracy:  0.7185
Top 20 n-gram features: 

(270.80972172840893, 'tcot')
(116.16442477018541, 'uniteblue')
(108.67366319808505, 'p2')
(83.867941299774074, 'dailykos')
(80.539265072797747, 'voteblue')
(77.203170675642397, 'republican')
(50.593496673136855, 'roc')
(49.822341989974547, 'alpolitics')
(49.33974062122131, 'teaparty')
(46.710506721754761, 'sayfie')
(46.201135432064966, 'dandc')
(44.345668087407731, 'pjnet')
(43.940708250175369, 'gapol')
(40.137570950793211, 'obama')
(38.244388776805863, 'utpol')
(34.017490902352684, 'ccot')
(31.872043105193164, 'equality')
(31.586163986676247, 'gop2012')
(30.733573967456909, 'crnc')
(30.691082840554412, 'jjauthor')
Best n-gram model confusion matrix: 
 [[3396 1489]
 [1389 3726]]


In [4]:
# NOTE(review): best_model and contingency_matrix are not defined in the visible cells
# (presumably they come from the star imports). Whether best_model writes the
# 'model.pkl' / 'test.pkl' files read on the next line cannot be told from here - confirm.
print('Best model test accuracy: ', best_model('train_newline.txt', 'dev_newline.txt'))
top_features, conf_matrix = contingency_matrix('model.pkl', 'test.pkl')

print('Top 20 n-gram features: \n')

# Print one returned feature entry per line.
for feature in top_features:
    print(feature)

print('Best n-gram model confusion matrix: \n', conf_matrix)


Best model test accuracy:  0.628
Top 20 n-gram features: 

(374.86902465625639, 'tcot')
(167.10234220909032, 'uniteblue')
(135.98224927300524, 'p2')
(112.66250893302396, 'voteblue')
(108.31607072289431, 'dailykos')
(90.062102847283427, 'http dailykos')
(85.934199847461727, 'republican')
(74.210790555055183, 'http tcot')
(71.757927421002378, 'roc')
(71.614476461460924, 'teaparty')
(66.095208185839184, 'alpolitics')
(63.009015091627745, 'dandc')
(58.548525727153155, 'sayfie')
(56.855083438833759, 'pjnet')
(50.059388638209946, 'gapol')
(48.209497373200705, 'obama')
(46.161888184233831, 'utpol')
(41.900431792274219, 'ccot')
(41.254243545006346, 'crnc')
(40.918250392611483, 'victory')
Best n-gram model confusion matrix: 
 [[1651 1020]
 [ 840 1489]]
