In [4]:
import numpy as np
import pickle
np.set_printoptions(precision=4,suppress=True)
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# Load the pickled inputs. Per the original notes, `documents` holds the contents of each
# tweet (one per row) and `tags` holds the tag of each case.
# NOTE(review): pickle.load can execute arbitrary code -- only load files from a trusted source.
# `with` guarantees the file handles are closed once loading is done.
with open('documents.pl', 'rb') as documents_file:
    documents = pickle.load(documents_file)
with open('tags.pl', 'rb') as tags_file:
    tags = pickle.load(tags_file)

# FEATURES

In [6]:
# Bag-of-words features for the whole corpus (used by the xgboost section below).
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC

# Count word n-grams of length 1 to 3 that appear in at least 3 documents, after stripping
# accents. The corpus is passed to fit_transform only: the first constructor parameter of
# CountVectorizer is `input`, not the data, so `documents` must not be passed there.
BoW = CountVectorizer(strip_accents='unicode', ngram_range=(1, 3), min_df=3)
X_counts = BoW.fit_transform(documents)

# Re-weight the raw counts by TF-IDF. A single fit_transform is enough; fitting first and
# then calling fit_transform on the same matrix only repeats the same work.
# NOTE(review): the vocabulary and IDF weights are fitted on all documents, including the
# rows later used as validation folds -- confirm this is acceptable for the evaluation.
tf_transformer = TfidfTransformer()
X_bag = tf_transformer.fit_transform(X_counts)

In [7]:
# Binary target vector: 1 where the tag equals 'Prio-Fall' (a priority case), 0 for any other tag.
Y = (np.asarray(tags) == 'Prio-Fall').astype(int)

In [8]:
Y.mean()  # share of priority cases, to see how imbalanced the data is (priority cases are rare)

0.1450676982591876

# CLASSIFIERS

In [15]:
#import xgboost as xgb
# Code for cross-validation
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold

In [16]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [17]:
##SVC for challenge 1A
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


In [18]:
# Stratified k-fold cross-validation of a LinearSVC on TF-IDF weighted word n-grams.
# The best-scoring fold (by F1) is kept and persisted together with the vectorizer and
# TF-IDF transformer that were fitted on that same fold.
k = 3
# NOTE(review): `StratifiedKFold(Y, k, ...)` is the legacy `sklearn.cross_validation`
# signature; the object itself is iterated to obtain (train, test) index arrays.
kfold = StratifiedKFold(Y, k, shuffle=True, random_state=0)

# Allocated here but not filled in by this cell.
results = np.zeros((k, 3))

compareScores = 0
finalSVCModel = None
best_cv = None
best_tfidfTransform = None

# Iterate over the k folds.
for fold, (train_ind, test_ind) in enumerate(kfold):
    # Fresh preprocessing objects per fold: the fitted vocabulary and IDF weights belong to
    # the model trained on them, so they must be kept (and saved) as a matching set.
    cv = CountVectorizer(min_df=3, max_df=10000, ngram_range=(1, 3), lowercase=True)
    tfidfTransform = TfidfTransformer()
    model = LinearSVC(dual=False, tol=1e-3, class_weight='balanced')

    trainContent = [documents[j] for j in train_ind]
    testContent = [documents[j] for j in test_ind]
    trainLabel = Y[train_ind]
    testLabel = Y[test_ind]

    # Fit the vocabulary and IDF weights on the training split only, then apply to the test split.
    train_tfidf = tfidfTransform.fit_transform(cv.fit_transform(trainContent))
    test_tfidf = tfidfTransform.transform(cv.transform(testContent))

    model.fit(train_tfidf, trainLabel)
    pred = model.predict(test_tfidf)
    currentScore = f1_score(testLabel, pred)

    # Always keep the first fold so a fitted model is saved even if every F1 score is 0.
    if finalSVCModel is None or currentScore > compareScores:
        compareScores = currentScore
        finalSVCModel = model
        best_cv = cv
        best_tfidfTransform = tfidfTransform

    print("F1 score:", currentScore)

# Rebind the names read by later cells so that `cv`, `tfidfTransform` and `model` always
# refer to objects fitted on the same fold as the saved model.
cv, tfidfTransform, model = best_cv, best_tfidfTransform, finalSVCModel

filename = 'finalized_model_SVC.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(finalSVCModel, model_file)
with open('countVectorizationSVC.sav', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)
with open('tfidfTransformSVC.sav', 'wb') as tfidf_file:
    pickle.dump(tfidfTransform, tfidf_file)

F1 score: 0.4938679245283019
F1 score: 0.4915346805024576
F1 score: 0.4882916863114883


In [20]:
# Predict the label of every tweet in the testing set, using `cv`, `tfidfTransform` and
# `model` exactly as the cross-validation cell left them.
# NOTE(review): this rebinds `documents`, so the training tweets are no longer reachable
# under that name; any cell that reads `documents` without reloading it will see the
# testing set after this cell has run.
# NOTE(review): pickle.load can execute arbitrary code -- only load files from a trusted source.
with open('documents_testing.pl', 'rb') as testing_file:
    documents = pickle.load(testing_file)
content = list(documents)
counts = cv.transform(content)

tfidf = tfidfTransform.transform(counts)

pred = model.predict(tfidf)

## XG BOOST

In [19]:
# Training parameters handed to xgboost: binary logistic objective evaluated with log-loss.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.07,
    'max_depth': 50,
    'min_child_weight': 6,
    'lambda': 1,
    'nthread': 4,
}

In [None]:
# Stratified k-fold cross-validation of an xgboost classifier on the TF-IDF features `X_bag`.
# This cell uses `xgb` and `RandomOverSampler`; neither is imported in the visible part of
# the notebook (the only xgboost import is commented out), so they are imported here.
import xgboost as xgb
from imblearn.over_sampling import RandomOverSampler

# Create k stratified folds
k = 3
kfold = StratifiedKFold(Y, k, shuffle=True, random_state=0)

# One row per fold: [f1, accuracy, accuracy on the priority rows]
results = np.zeros((k, 3))

# iterate over the k folds
for i, (train_ind, test_ind) in enumerate(kfold):
    X_train, X_val, Y_train, Y_val = X_bag[train_ind], X_bag[test_ind], Y[train_ind], Y[test_ind]

    # Oversample the training split only; the validation split keeps its original balance.
    ros = RandomOverSampler(random_state=0)
    X_resampled, Y_resampled = ros.fit_sample(X_train, Y_train)
    print(X_resampled.shape, X_val.shape, Y_resampled.shape, Y_val.shape)

    D_train = xgb.DMatrix(X_resampled, label=Y_resampled)
    D_val = xgb.DMatrix(X_val, label=Y_val)

    # NOTE(review): the validation fold is also the early-stopping set, so the scores
    # reported below were not measured on data unseen during training -- confirm intent.
    watchlist = [(D_train, 'train'), (D_val, 'valid')]
    bst = xgb.train(params, D_train, 400, watchlist, early_stopping_rounds=50, verbose_eval=10)

    # Round the predicted probabilities to hard 0/1 labels (threshold 0.5).
    predictions = bst.predict(data=D_val).round()
    f1 = f1_score(Y_val, predictions)
    acc = accuracy_score(Y_val, predictions)
    # Accuracy restricted to the rows whose true label is 1, i.e. recall of the priority class.
    prio_acc = accuracy_score(Y_val[Y_val==1], predictions[Y_val==1])
    results[i] = [f1, acc, prio_acc]
    print('f1 score: {}'.format(f1))
    # `acc` is a fraction, not a percentage; the former '\%' suffix was an invalid escape
    # sequence that printed a literal backslash.
    print('accuracy: {}'.format(acc))
    print('Prio accuracy: {}'.format(prio_acc))
    print(confusion_matrix(Y_val, predictions))

(124348, 321767) (36363, 321767) (124348,) (36363,)
[0]	train-logloss:0.668219	valid-logloss:0.672347
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.516264	valid-logloss:0.548634
[20]	train-logloss:0.445863	valid-logloss:0.495861
[30]	train-logloss:0.406458	valid-logloss:0.468739
[40]	train-logloss:0.382029	valid-logloss:0.45344
[50]	train-logloss:0.367049	valid-logloss:0.44488
[60]	train-logloss:0.357191	valid-logloss:0.439303
[70]	train-logloss:0.349485	valid-logloss:0.435291
[80]	train-logloss:0.342704	valid-logloss:0.431736
[90]	train-logloss:0.335843	valid-logloss:0.428089
[100]	train-logloss:0.330533	valid-logloss:0.425454
[110]	train-logloss:0.325089	valid-logloss:0.422686
[120]	train-logloss:0.319438	valid-logloss:0.419606


In [None]:
# Mean and variance across folds, computed per metric. The columns of `results` are
# [f1, accuracy, priority accuracy]; pooling them into one number would mix different metrics.
np.mean(results, axis=0), np.var(results, axis=0)