In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold
import matplotlib.pyplot as plt
from string import punctuation
import os
from collections import Counter
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from pandas import DataFrame

Using TensorFlow backend.


In [193]:
cwd = os.getcwd()
datadir = '/'.join(cwd.split('/')) + '/data/'
data = pd.read_csv(datadir + 'labeled_data.csv', sep=',',index_col=0)
data['class'].describe().transpose()

count    24783.000000
mean         1.110277
std          0.462089
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          2.000000
Name: class, dtype: float64

In [194]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
def clean_twitter(data):
    tweets=data['tweet']
    table = str.maketrans('', '', punctuation)
    stop_words = set(stopwords.words('english'))
    lemma = nltk.wordnet.WordNetLemmatizer()
    cleandata = []

    for tweet in tweets:
        tokens = tweet.split()
        cleanstring = []
        for o in tokens:
            # Clearn words that start as @
            if o.startswith('@'): continue
            # remove punctuation from each token
            o = o.translate(table)  
            # filter out short tokens 
            if len(o) < 2: continue
            # remove remaining tokens that are not alphabetic
            if not o.isalpha(): continue
            # filter out stop words
            if o in stop_words: continue
            o = lemma.lemmatize(o)
            # change to lowercase
            o = o.lower()
            cleanstring.append(o)
        #convert the array to string
        cleanstring=' '.join(cleanstring)
        cleandata.append(cleanstring)
    return cleandata

In [209]:
#test data
#need to be modified to cover the whole dataset!!!
raw_data=data[:5000]
twitter_cleaned=clean_twitter(raw_data)

In [210]:
#output matrix

def prepare_data(data, mode):
	# create the tokenizer
	tokenizer = Tokenizer()
	# fit the tokenizer on the documents
	tokenizer.fit_on_texts(data)
	# encode training data set
	data_m = tokenizer.texts_to_matrix(data, mode=mode)
	return data_m

In [211]:
#convert the data into matrix
X = prepare_data(twitter_cleaned, 'binary')

In [212]:
Y=raw_data['class']
kfolds = KFold(raw_data.shape[0], n_folds = 4)

In [213]:
param_grid_lr = {'C':[10**i for i in range(-3, 3)], 'penalty':[ 'l2'], 'multi_class':['multinomial'], 'solver':['newton-cg']}

#2nd, call the GridSearchCV class, use LogisticRegression and 'roc_auc' for scoring
lr_grid_search = GridSearchCV(LogisticRegression(), param_grid_lr, cv = kfolds,scoring='neg_log_loss') 
lr_grid_search.fit(X, Y)

GridSearchCV(cv=sklearn.cross_validation.KFold(n=5000, n_folds=4, shuffle=False, random_state=None),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'solver': ['newton-cg'], 'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l2'], 'multi_class': ['multinomial']},
       pre_dispatch='2*n_jobs', refit=True, scoring='neg_log_loss',
       verbose=0)

In [214]:
best_1 = lr_grid_search.best_score_
#number of obs to be test !!!
test_index=int (1/10*raw_data.shape[0])
#print(best_1)
X_train = X[test_index:]
Y_train = Y[test_index:]
X_test=X[:test_index]
Y_test=Y[:test_index]

lr_best = LogisticRegression(**lr_grid_search.best_params_ ) 
lr_best.fit(X_train, Y_train)
pred_lr=lr_best.predict(X_test)
pred_lr
Y_test

0      2
1      1
2      1
3      1
4      1
5      1
6      1
7      1
8      1
9      1
10     1
11     1
12     1
13     1
14     1
15     1
16     1
17     1
18     1
19     1
20     1
21     1
22     1
23     1
24     1
25     1
26     1
27     1
28     1
29     1
      ..
478    1
479    1
480    1
481    2
482    1
483    1
484    1
485    1
486    1
487    1
488    1
489    1
490    1
491    1
492    1
493    1
494    1
495    1
496    1
497    1
498    1
499    1
500    1
501    1
502    2
503    1
504    1
505    1
506    1
507    1
Name: class, Length: 500, dtype: int64

In [None]:
from sklearn import metrics
def printreport(pred, true)
y_true = true
y_pred = pred
target_names = ['class 2', 'class 1', 'class 0']
print (metrics.confusion_matrix(y_true, y_pred ), labels=target_names)
print(metrics.classification_report(y_true, y_pred, target_names=target_names))

In [None]:
from sklearn import svm
gammas = [0.001, 0.01, 0.1, 1]
param_grid_svm = {}
svm_grid_search = GridSearchCV(svm.SVC(kernel='rbf',decision_function_shape='ovo',probability=True), param_grid_svm, cv=kfolds, scoring='neg_log_loss')
svm_grid_search.fit(X, Y)
svm_grid_search.best_params_


In [130]:
print (svm_grid_search.best_score_)

-0.3982564861800524


In [None]:
# prepare labels
ytrain = array([0 for _ in range(900)] + [1 for _ in range(900)])
ytest = array([0 for _ in range(100)] + [1 for _ in range(100)])

modes = ['binary', 'count', 'tfidf', 'freq']
results = DataFrame()
for mode in modes:
	# prepare data for mode
	Xtrain, Xtest = prepare_data(train_docs, test_docs, mode)
	# evaluate model on data for mode
	results[mode] = evaluate_mode(Xtrain, ytrain, Xtest, ytest)
# summarize results
print(results.describe())
# plot results
results.boxplot()
pyplot.show()