In [1]:
%matplotlib inline

In [2]:
import csv
import random
from sklearn.feature_extraction.text import *
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import numpy as np
import pickle

## Reading File

In [3]:
with open('../labeling_tweets/Data/batch1_to_9.csv', 'rb') as csvfile:
    twitter_flu_reader = csv.reader(csvfile)
    twitter_flu_list = list(twitter_flu_reader)    
random.shuffle(twitter_flu_list)

In [4]:
len(twitter_flu_list)

4000

In [5]:
tweets= [row[1] for row in twitter_flu_list]
labels = [row[2] for row in twitter_flu_list]

## Exploration

In [6]:
# label values
print 'Label values: ', list(set(labels))

Label values:  ['Y', 'N']


In [7]:
print 'Number of positive labels: ' , labels.count("Y")

Number of positive labels:  454


## Splitting Files

In [8]:
def split_file(data, train_frac = 0.7, dev_frac = 0.15):   
    train_split_idx = int(train_frac * len(data))
    dev_split_idx = int ((train_frac + dev_frac)* len(data))
    train_data = data[:train_split_idx]
    dev_data = data[train_split_idx:dev_split_idx]
    test_data = data[dev_split_idx:]
    return train_data, dev_data, test_data

In [9]:
train_tweets, dev_tweets, test_tweets = split_file (tweets)
train_labels, dev_labels, test_labels = split_file (labels)
print 'Training set samples:', len (train_tweets)
print 'Dev set samples:', len (dev_tweets)
print 'Test set samples:', len (test_tweets)

Training set samples: 2800
Dev set samples: 600
Test set samples: 600


## Pre-Processing tweets

In [10]:
max_number_features = 10000

In [11]:
# TfidfVectorizer
# Convert all characters to lowercase before tokenizing (by default)
# tokenization (by default)
# max_features: consider the top max_features ordered by term frequency across the corpus
vectorizer = TfidfVectorizer(max_features=max_number_features,stop_words='english',max_df=0.9 )  
train_tweets_vector = vectorizer.fit_transform(train_tweets)
dev_tweets_vector = vectorizer.transform(dev_tweets)

## Logistic Regression

In [18]:
def PerformLogisticRegression(c, train_data, train_labels, dev_data, dev_labels):
    model = LogisticRegression(C=c ,class_weight='balanced')
    model.fit(train_data, train_labels)   
    predicted_labels = model.predict(dev_data)
    
    #scores
    score = metrics.f1_score(dev_labels,predicted_labels, pos_label = "Y")
    f1a, f1b =metrics.f1_score(dev_labels,predicted_labels, average=None)
    precision = metrics.precision_score(dev_labels,predicted_labels, pos_label = "Y")
    accuracy = np.mean(predicted_labels == dev_labels) 
    
    #roc_auc
    predicted_prob = model.predict_proba(dev_data) 
    fpr, tpr, thresholds = metrics.roc_curve(dev_labels, predicted_prob[:,1], pos_label = 'Y')
    roc_auc = metrics.auc(fpr, tpr)
    print ' c: %3.5f ,  accuracy: %3.5f , precision-score:%3.5f,  f1-score: %3.5f, (%3.5f,%3.5f)  roc_auc: %3.5f ' %(c,  accuracy,precision,score , f1a, f1b, roc_auc )
    return (score, precision, accuracy,  model)


In [20]:
 # looking for the best C value 
c_values =  [ 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0 ]
max_score = 0;
max_set =()
for c in c_values:
    score, precision, accuracy, model = PerformLogisticRegression(c, train_tweets_vector, train_labels, dev_tweets_vector, dev_labels)
    if (score > max_score):
        max_score = score
        max_set = (c,accuracy, score, precision, model)



 c: 0.00010 ,  accuracy: 0.61000 , precision-score:0.19286,  f1-score: 0.31579, (0.72727,0.31579)  roc_auc: 0.82281 
 c: 0.00100 ,  accuracy: 0.75333 , precision-score:0.25287,  f1-score: 0.37288, (0.84647,0.37288)  roc_auc: 0.82303 
 c: 0.01000 ,  accuracy: 0.84833 , precision-score:0.35644,  f1-score: 0.44172, (0.91225,0.44172)  roc_auc: 0.82342 
 c: 0.10000 ,  accuracy: 0.83333 , precision-score:0.32727,  f1-score: 0.41860, (0.90272,0.41860)  roc_auc: 0.82768 
 c: 0.50000 ,  accuracy: 0.84000 , precision-score:0.34545,  f1-score: 0.44186, (0.90661,0.44186)  roc_auc: 0.83229 
 c: 1.00000 ,  accuracy: 0.84500 , precision-score:0.35780,  f1-score: 0.45614, (0.90962,0.45614)  roc_auc: 0.83094 
 c: 5.00000 ,  accuracy: 0.84833 , precision-score:0.35354,  f1-score: 0.43478, (0.91242,0.43478)  roc_auc: 0.82126 
 c: 10.00000 ,  accuracy: 0.84667 , precision-score:0.34375,  f1-score: 0.41772, (0.91171,0.41772)  roc_auc: 0.81137 


In [21]:
print '\n best c = %3.2f, accuracy = %2.5f, F1-score = %2.5f, precision = %2.5f' % max_set[0:4]


 best c = 1.00, accuracy = 0.84500, F1-score = 0.45614, precision = 0.35780


In [22]:
#saving model for best f1-score 
LR_model = max_set[4]
output = open('twitter_flu_LR_classifier.pkl', 'wb')
pickle.dump(LR_model, output)
output.close()

In [23]:
#saving vectorizer
output = open('TfidfVectorizer.pkl', 'wb')
pickle.dump(vectorizer, output)
output.close()

In [49]:
#saving dev file just for cross-validation of pkl files
#def writeList(setName, filename):
#    with  open(filename, 'w') as fout:
#        writer = csv.writer(fout, delimiter=',',lineterminator='\n')
#        for row in setName:
#            writer.writerow([row])
#writeList(dev_tweets,  'dev_tweets.csv')
#writeList(dev_labels,  'dev_labels.csv')