In [2]:
import numpy as np
from time import time
import csv
import json
from sklearn.model_selection import cross_val_score
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
import nltk
import codecs
import pandas as pd



In [3]:
data = pd.read_csv("./data/processed/final_dataset.csv",encoding="utf-8")
print(data.dtypes)
print(data.shape)

Unnamed: 0         int64
attitude          object
hashtags          object
id                 int64
processed_text    object
raw_text          object
seed_topic        object
topic             object
dtype: object
(123457, 8)


In [16]:
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    return tokens

originalclass = []
predictedclass = []
inner_cv = 0

def classification_report_with_accuracy_score(y_true, y_pred):
    global originalclass
    global predictedclass
    global inner_cv
    inner_cv += 1
    print("CrossVal Iteration",inner_cv)
    originalclass.extend(y_true)
    predictedclass.extend(y_pred)
    return metrics.accuracy_score(y_true, y_pred) # return accuracy score

def benchmark(clf,name):
    global originalclass
    global predictedclass
    global inner_cv
    
    originalclass = []
    predictedclass = []
    inner_cv = 0
    
    cross_validation_k = 10 
    print('_' * 80)
    print("Cross Validating: ")
    print(clf)
    t0 = time()
    
    cross_scores = cross_val_score(clf,X,y,cv=cross_validation_k,scoring=metrics.make_scorer(classification_report_with_accuracy_score))
    print("cross validation scores: %s" % cross_scores)
    train_time = (time() - t0)/cross_validation_k
    print("time: %0.3fs" % train_time)

    print(metrics.classification_report(originalclass, predictedclass)) 
    
    score = cross_scores.mean()

    print("accuracy mean:   %0.3f" % score)

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time

y = data["attitude"]

In [5]:
text_vectorizer = TfidfVectorizer( max_df=1.0,min_df=1,ngram_range=(1,3),tokenizer=tokenize,max_features=50000)
X = text_vectorizer.fit_transform(data['processed_text'])
print("X vector n_samples: %d, n_features: %d" % X.shape)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


X vector n_samples: 123457, n_features: 2207827


In [18]:
results = []
for clf, name in (
        #(MultinomialNB(alpha=0.2),'MultinomialNB'),
        #(BernoulliNB(alpha=0.45),'BernoulliNB'),
        #(RidgeClassifier(tol=1e-2, solver="lsqr",class_weight='balanced',normalize=False), "Ridge Classifier"),
        #(Perceptron(n_iter=33,penalty=None,class_weight='balanced'), "Perceptron"),
        #(PassiveAggressiveClassifier(n_iter=32,class_weight='balanced',C=1,loss="hinge"), "Passive-Aggressive"),
        #(KNeighborsClassifier(n_neighbors=20,weights="distance"), "kNN"),
        #(GradientBoostingClassifier(n_estimators=8),"GradientBoosting"),
        #(SGDClassifier(alpha=.0001,learning_rate='optimal', n_iter=40,class_weight='balanced',penalty='l2',loss='squared_hinge'),'SGDClassifier'),
        (LinearSVC(loss='squared_hinge', penalty='l2',class_weight='balanced',dual=False, tol=1e-3,C=0.5),'LinearSVC'),
        #(RandomForestClassifier(class_weight='balanced',criterion='gini',n_estimators=40,max_features='sqrt',min_samples_split=7,min_samples_leaf=3,n_jobs=-1),'RandomForest'),
        
        ):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf,name))

Ridge Classifier
________________________________________________________________________________
Cross Validating: 
RidgeClassifier(alpha=1.0, class_weight='balanced', copy_X=True,
        fit_intercept=True, max_iter=None, normalize=False,
        random_state=None, solver='lsqr', tol=0.01)




('CrossVal Iteration', 1)
('CrossVal Iteration', 2)
('CrossVal Iteration', 3)
cross validation scores: [0.81085219 0.8123056  0.81437111]
time: 7.690s
             precision    recall  f1-score   support

   negative       0.72      0.69      0.71     40516
   positive       0.85      0.87      0.86     82941

avg / total       0.81      0.81      0.81    123457

accuracy mean:   0.813
()


# Dataset Size Influence on Results
Testing how the size of the data set influences the overall result.
It seems obvious that a bigger dataset results in overall better classification results.
Because later on, the individual topics are trained and tested, we need to take the influence of the dataset size into account.
On average, each topic only has ca. 15.000 tweets. Thus, the results can not be compared to a model which has taken the full corpus of tweets (ca. 123.000) into account.

The total size of the data set will be splitted into 10 linear sizes.
For each size, 3 stratified shuffle splits will be evaluated using a 10-Fold cross validation each.
Example:
Size = 10.000
Then, 3 new datasets are generated by a stratified shuffle with the size of 10.000 tweets.
Each of the 3 new datasets will be evaluated using 10-Fold Cross Validation.

This results in a total of 30 evaluations for each size and an overall evaluation of 300 training and testing steps.
For each size, all results are aggregated by mean in order to receive the most accurate result.

In [52]:
def benchmarkTrainSize(model, X, y, n):
    train_size = (n / float(len(y)))
    scores = []
    for train, _ in StratifiedShuffleSplit(y, n_iter=3, train_size=train_size):
        text_vectorizer = TfidfVectorizer( max_df=1.0,min_df=1,ngram_range=(1,3),tokenizer=tokenize)
        X_train = text_vectorizer.fit_transform(data["processed_text"][train])
        print(X_train.shape)
        scores.append(cross_val_score(clf,X[train],y[train],cv=10,scoring='f1_weighted',n_jobs=-1,verbose=10).mean())
        print(np.mean(scores))
    return np.mean(scores)

train_size_results = []
train_size_steps = 10
train_sizes = np.linspace(start=len(y)/train_size_steps,stop=len(y),num=train_size_steps)[:-1]

for n in train_sizes[0:1]:
    size = int(n)
    print(size)
    train_size_results.append({'accuracy':benchmarkTrainSize(LinearSVC(loss='squared_hinge', penalty='l2',class_weight='balanced',dual=False, tol=1e-3,C=0.6),X,y,n),'train_size':size})
    print(train_size_results)


12345
(12345, 321517)
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.757309, total=   4.7s
[CV]  ................................................................
[CV] ................................. , score=0.764754, total=   4.9s
[CV]  ................................................................
[CV] ................................. , score=0.783183, total=   5.0s
[CV] ................................. , score=0.801373, total=   4.9s
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.792841, total=   4.8s
[CV]  .................................................

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    9.6s remaining:    9.6s


[CV] ................................. , score=0.752019, total=   5.1s
[CV]  ................................................................
[CV] ................................. , score=0.786542, total=   5.1s


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   10.4s remaining:    4.4s


[CV] ................................. , score=0.774328, total=   5.9s
[CV] ................................. , score=0.788692, total=   3.1s
[CV] ................................. , score=0.771520, total=   3.8s


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   13.5s finished


0.777256054017878
(12345, 325362)
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.775563, total=   4.6s
[CV]  ................................................................
[CV] ................................. , score=0.762304, total=   5.3s
[CV]  ................................................................
[CV] ................................. , score=0.781600, total=   5.3s
[CV] ................................. , score=0.772247, total=   5.3s
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.800206, total=   5.1s
[CV]  .....................................

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    9.9s remaining:    9.9s


[CV] ................................. , score=0.759715, total=   5.7s
[CV]  ................................................................
[CV] ................................. , score=0.766389, total=   6.5s


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   12.1s remaining:    5.2s


[CV] ................................. , score=0.768103, total=   6.7s
[CV] ................................. , score=0.771194, total=   4.8s
[CV] ................................. , score=0.788599, total=   4.0s


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   15.3s finished


0.7759239282175623
(12345, 323712)
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.783715, total=   5.9s
[CV]  ................................................................
[CV] ................................. , score=0.778911, total=   6.3s
[CV]  ................................................................
[CV] ................................. , score=0.779646, total=   7.0s
[CV]  ................................................................
[CV] ................................. , score=0.778054, total=   6.9s
[CV]  ................................................................
[CV] ................................. , score=0.793922, total=   4.8s
[CV]  ....................................

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   11.0s remaining:   11.0s


[CV] ................................. , score=0.765993, total=   5.7s
[CV]  ................................................................
[CV] ................................. , score=0.773879, total=   6.3s


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   13.5s remaining:    5.8s


[CV] ................................. , score=0.781420, total=   6.5s
[CV] ................................. , score=0.788408, total=   4.2s
[CV] ................................. , score=0.802674, total=   3.5s


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   15.8s finished


0.7781699976344902
[{'train_size': 12345, 'accuracy': 0.7781699976344902}]


# Testing Baseline Against Manually Labelled Tweets

In [41]:
manually_labelled = pd.read_csv("./data/manually_labelled.csv",encoding="utf-8")
print(manually_labelled.dtypes)
print(manually_labelled.shape)

attitude           object
hashtags           object
id                float64
processed_text     object
raw                object
seed_topic         object
topic              object
manualAttitude     object
dtype: object
(473, 8)


In [36]:
text_vectorizer = TfidfVectorizer( max_df=1.0,min_df=1,ngram_range=(1,3),tokenizer=tokenize)
text_vectorizer.fit(data["processed_text"])

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenize at 0x13f98dc08>, use_idf=True,
        vocabulary=None)

In [43]:
X_test = text_vectorizer.transform(manually_labelled["processed_text"])
y_test = manually_labelled["manualAttitude"]

#exclude of tweets from training which should be tested
train = data[~data.id.isin(manually_labelled["id"])]

X_train = text_vectorizer.transform(train["processed_text"])
y_train = train["attitude"]

In [45]:
clf_man = LinearSVC(loss='squared_hinge', penalty='l2',class_weight='balanced',dual=False, tol=1e-3,C=0.5)
for _ in range(1):
    clf.fit(X_train,y_train)
    predicted = clf.predict(X_test)
    print(metrics.classification_report(y_test, predicted))
    print(metrics.accuracy_score(y_test, predicted))

             precision    recall  f1-score   support

   negative       0.94      0.76      0.84       256
   positive       0.77      0.94      0.84       217

avg / total       0.86      0.84      0.84       473

0.8414376321353065
