## Multiclass Topic Prediction on StackExchange data

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, cross_val_score
import numpy as np
import pickle
import warnings
warnings.filterwarnings('ignore')

### read in data

In [None]:
data = pd.read_csv("content_topic_prediction/Topic_prediction_data_train.txt",sep='\t',header=None)
data.columns = ['text','uid','type','timestamp']
labels = pd.read_csv("content_topic_prediction/Topic_prediction_labels_train.txt",sep='\t',header=None)
labels.columns = ['label']
data['label'] = labels['label']

In [None]:
data.head()

### drop null values

In [None]:
data = data[(data['text'].notnull())]

### convert text data to term frequency vectors.

In [None]:
vectorizer = CountVectorizer(analyzer = "word",   
                             tokenizer = None,    
                             preprocessor = None, 
                             stop_words = None,   
                             max_features = 5000)
train_data_features = vectorizer.fit_transform(data["text"])
pickle.dump(vectorizer,open("vectorizer.p","wb" ))
features= np.array(train_data_features)

In [14]:
features

array(<96269x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 3796534 stored elements in Compressed Sparse Row format>,
      dtype=object)

### Train a support vector classifier

In [18]:
clf_LinearSVC= LinearSVC(penalty='l2')
scores_LinearSVC= cross_val_score(clf_LinearSVC, train_data_features, data["label"], cv=5, scoring='accuracy')
print("Accuracy is {0}".format(scores_LinearSVC.mean()))

Accuracy is 0.7259146122199797


### Train a Logisitic Regression classifier. 

In [20]:
clf_LR= LogisticRegression(random_state=1,C=1,penalty='l2')
scores_LR=  cross_val_score(clf_LR, train_data_features, data["label"], cv=5, scoring='accuracy')
print("Accuracy is {0}".format(scores_LR.mean()))

Accuracy is 0.7248131364637498


### tune model paramaters

In [None]:
#lr = LogisticRegression(random_state=1)
#lr_params = {'penalty':('l1','l2'),'C':[0.1,1,10],'fit_intercept':(True,False)}
#lr_cv = GridSearchCV(lr,lr_params,cv=2)
#nb = MultinomialNB()
#nb_params = {'alpha':[0.001,0.5,1]}
#nb_cv = GridSearchCV(nb,nb_params,cv=2)
#svc = LinearSVC(random_state=1)
#svc_params = {'penalty':('l1','l2'),'C':[0.1,1,10]}
#svc_cv = GridSearchCV(lr,lr_params,cv=2)

#lr_cv.fit(train_data_features,data["label"])
#nb_cv.fit(train_data_features,data["label"])
#svc_cv.fit(train_data_features,data["label"])
#print(lr_cv.best_params_)
#print(nb_cv.best_params_)
#print(svc_cv.best_params_)

### Now, train an ensemble model, get cross validation accuracy scores using 5-fold validation

In [None]:
#classifier = VotingClassifier(estimators=[('lr',LogisticRegression(random_state=1,C=1,penalty='l2')),('nb',MultinomialNB(alpha=0.001)),('svc',LinearSVC(random_state=1,C=1,penalty='l2'))],voting='hard')
classifier = VotingClassifier(estimators=[('lr',LogisticRegression()),('nb',MultinomialNB()),('svc',LinearSVC())],voting='hard')
scores = cross_val_score(classifier, train_data_features, data["label"], cv=5, scoring='accuracy')

#save the classifier
#pickle.dump(classifier,open("ensemble_model.p","wb"))

### print average accuracy for ensemble.

In [15]:
print("Accuracy is {0}".format(scores.mean()))
#tf and logreg - .7202
#tfidf and logreg - .709
#tf and ensemble - .7512
#tfidf and ensemble - .7286
#tf and tuned ensemble - .751
#tf and ensemble and bi/tri grams - 0.7344

Accuracy is 0.7541272020289477


### Use TF-IDF scheme and include bi-grams and tri-grams features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfVect= TfidfVectorizer(min_df=.001, sublinear_tf = True,ngram_range=(1,3)).fit_transform(data["text"]).todense()

In [None]:
tfVect.shape

In [None]:
classifier_tid = VotingClassifier(estimators=[('lr',LogisticRegression(random_state=1,C=1,penalty='l2')),('nb',MultinomialNB(alpha=0.001)),('svc',LinearSVC(random_state=1,C=1,penalty='l2'))],voting='hard')
scores_tid = cross_val_score(classifier, tfVect, data["label"], cv=5, scoring='accuracy')

In [16]:
print("Accuracy is {0}".format(scores_tid.mean()))

Accuracy is 0.7325510633527312
