In [1]:
import os
import sqlite3
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

In [2]:
#http://www.sqlitetutorial.net/sqlite-python/create-tables/
def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by db_file
    :param db_file: database file
    :return: Connection object or None
    """
    try:
        conn = sqlite3.connect(db_file)
        return conn
    except Error as e:
        print(e)
 
    return None

In [3]:
#Taking 0.5 Million entries to a dataframe.
write_db = 'Titlemoreweight.db'
if os.path.isfile(write_db):
    conn_r = create_connection(write_db)
    if conn_r is not None:
        preprocessed_data = pd.read_sql_query("""SELECT question, Tags FROM QuestionsProcessed limit 500000""", conn_r)
conn_r.commit()
conn_r.close()

In [4]:
preprocessed_data.head()

Unnamed: 0,question,tags
0,dynam datagrid bind silverlight dynam datagrid...,c# silverlight data-binding
1,dynam datagrid bind silverlight dynam datagrid...,c# silverlight data-binding columns
2,java.lang.noclassdeffounderror javax servlet j...,jsp jstl
3,java.sql.sqlexcept microsoft odbc driver manag...,java jdbc
4,better way updat feed fb php sdk better way up...,facebook api facebook-php-sdk


In [5]:
print("number of data points in sample :", preprocessed_data.shape[0])
print("number of dimensions :", preprocessed_data.shape[1])

number of data points in sample : 500000
number of dimensions : 2


__ Converting string Tags to multilable output variables __ 

In [6]:
vectorizer = CountVectorizer(tokenizer = lambda x: x.split(), binary='true')
multilabel_y = vectorizer.fit_transform(preprocessed_data['tags'])

__ Selecting 500 Tags __

In [7]:
def tags_to_choose(n):
    t = multilabel_y.sum(axis=0).tolist()[0]
    sorted_tags_i = sorted(range(len(t)), key=lambda i: t[i], reverse=True)
    multilabel_yn=multilabel_y[:,sorted_tags_i[:n]]
    return multilabel_yn

def questions_explained_fn(n):
    multilabel_yn = tags_to_choose(n)
    x= multilabel_yn.sum(axis=1)
    return (np.count_nonzero(x==0))

In [8]:
questions_explained = []
total_tags=multilabel_y.shape[1]
total_qs=preprocessed_data.shape[0]
for i in range(500, total_tags, 100):
    questions_explained.append(np.round(((total_qs-questions_explained_fn(i))/total_qs)*100,3))

In [9]:
# we will be taking 500 tags
multilabel_yx = tags_to_choose(500)
print("number of questions that are not covered :", questions_explained_fn(500),"out of ", total_qs)

number of questions that are not covered : 45221 out of  500000


<h2>4.2 Split the data into test and train (80:20) </h2>

In [10]:
train_datasize=400000
x_train=preprocessed_data.head(train_datasize)
x_test=preprocessed_data.tail(preprocessed_data.shape[0] - train_datasize) #400000

y_train = multilabel_yx[0:train_datasize,:]
y_test = multilabel_yx[train_datasize:preprocessed_data.shape[0],:]

In [11]:
print("Number of data points in train data :", y_train.shape)
print("Number of data points in test data :", y_test.shape)

Number of data points in train data : (400000, 500)
Number of data points in test data : (100000, 500)


<h3> Featurizing data with TfIdf vectorizer </h3>

In [12]:
start = datetime.now()
vectorizer = TfidfVectorizer(min_df=0.00009, max_features=200000, smooth_idf=True, norm="l2", \
                             tokenizer = lambda x: x.split(), sublinear_tf=False, ngram_range=(1,3))
x_train_tfidf = vectorizer.fit_transform(x_train['question'])
x_test_tfidf = vectorizer.transform(x_test['question'])
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:03:52.648371


In [13]:
print("Dimensions of train data X:",x_train_tfidf.shape, "Y :",y_train.shape)
print("Dimensions of test data X:",x_test_tfidf.shape,"Y:",y_test.shape)

Dimensions of train data X: (400000, 94927) Y : (400000, 500)
Dimensions of test data X: (100000, 94927) Y: (100000, 500)


<h3> Featurizing data with BoW vectorizer </h3>

In [14]:
start = datetime.now()
vectorizer = CountVectorizer(min_df=0.00009, max_features=200000, ngram_range=(1,3),binary=True,tokenizer = lambda x: x.split())
vectorizer.fit(x_train['question'])
x_train_bow = vectorizer.transform(x_train['question'])
x_test_bow = vectorizer.transform(x_test['question'])
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:04:44.483483


In [15]:
print("Dimensions of train data X:",x_train_bow.shape, "Y :",y_train.shape)
print("Dimensions of test data X:",x_test_bow.shape,"Y:",y_test.shape)

Dimensions of train data X: (400000, 94927) Y : (400000, 500)
Dimensions of test data X: (100000, 94927) Y: (100000, 500)


## Applying Logistic Regression with HyperParameter Tuning on BOW

In [16]:
# Please write all the code with proper documentation
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from tqdm import tqdm
from sklearn.metrics import f1_score


summary=[]
f1_test_bow_dict={}


alpha=[0.00001,0.0001,0.001,0.01,0.1,1,10,100]
for i in tqdm(alpha):
    # create instance of model
    lr=OneVsRestClassifier(LogisticRegression(penalty='l2', C=i), n_jobs=-1)
    
     # fitting the model on crossvalidation train
    lr.fit(x_train_bow, y_train)
    
    # predict the response on the crossvalidation train
    pred = lr.predict(x_test_bow)
        
    f1 = f1_score(y_test, pred, average='micro')
    
    f1_test_bow_dict[i]=f1
    
    
    
    
print(f1_test_bow_dict)


100%|███████████████████████████████████████████████████████████████████████████████| 8/8 [11:31:56<00:00, 7752.95s/it]


{1e-05: 0.0, 0.0001: 0.03133729148996325, 0.001: 0.17032792430781216, 0.01: 0.3368799826286565, 0.1: 0.43849109024341437, 1: 0.4596690189404498, 10: 0.4549609431176056, 100: 0.4512246172427754}


### Find the best alpha value from the result

In [17]:
#https://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary
def find_highest_alpha(k_dict):
    k=max(k_dict, key=k_dict.get)
    return k 

print(find_highest_alpha(f1_test_bow_dict))

1


In [18]:
from sklearn.metrics import precision_score,recall_score,accuracy_score,hamming_loss


lr=OneVsRestClassifier(LogisticRegression(penalty='l2', C=find_highest_alpha(f1_test_bow_dict)), n_jobs=-1)
lr.fit(x_train_bow, y_train)
pred = lr.predict(x_test_bow)


In [19]:
accuracy_bow=accuracy_score(y_test, pred)
hamming_bow=hamming_loss(y_test,pred)

f1_score_bow_micro=f1_score(y_test, pred, average='micro')
precision_bow_micro=precision_score(y_test, pred, average='micro')
recall_bow_micro=recall_score(y_test, pred, average='micro')

f1_score_bow_macro=f1_score(y_test, pred, average='macro')
precision_bow_macro=precision_score(y_test, pred, average='macro')
recall_bow_macro=recall_score(y_test, pred, average='macro')

In [20]:
summary.append(['BOW','Logistic Regression',find_highest_alpha(f1_test_bow_dict),accuracy_bow,hamming_bow,f1_score_bow_micro,precision_bow_micro,recall_bow_micro,f1_score_bow_macro,precision_bow_macro,recall_bow_macro])

## Applying Logistic Regression with HyperParameter Tuning on TFIDF

In [21]:
f1_test_tfidf_dict={}


alpha=[0.00001,0.0001,0.001,0.01,0.1,1,10,100]
for i in tqdm(alpha):
    # create instance of model
    lr=OneVsRestClassifier(LogisticRegression(penalty='l2', C=i), n_jobs=-1)
    
     # fitting the model on crossvalidation train
    lr.fit(x_train_tfidf, y_train)
    
    # predict the response on the crossvalidation train
    pred = lr.predict(x_test_tfidf)
        
    f1 = f1_score(y_test, pred, average='micro')
    
    f1_test_tfidf_dict[i]=f1
    
print(f1_test_tfidf_dict)


100%|████████████████████████████████████████████████████████████████████████████████| 8/8 [6:28:42<00:00, 4611.74s/it]


{1e-05: 0.0, 0.0001: 0.0, 0.001: 6.903772911896351e-05, 0.01: 0.04054684819957372, 0.1: 0.2584937108746632, 1: 0.44464504481268946, 10: 0.4842800906237535, 100: 0.47650294455176984}


## Find the best alpha

In [22]:
print(find_highest_alpha(f1_test_tfidf_dict))

10


In [23]:
lr=OneVsRestClassifier(LogisticRegression(penalty='l2', C=find_highest_alpha(f1_test_tfidf_dict)), n_jobs=-1)
lr.fit(x_train_tfidf, y_train)
pred = lr.predict(x_test_tfidf)

accuracy_tfidf=accuracy_score(y_test, pred)
hamming_tfidf=hamming_loss(y_test,pred)

f1_score_tfidf_micro=f1_score(y_test, pred, average='micro')
precision_tfidf_micro=precision_score(y_test, pred, average='micro')
recall_tfidf_micro=recall_score(y_test, pred, average='micro')

f1_score_tfidf_macro=f1_score(y_test, pred, average='macro')
precision_tfidf_macro=precision_score(y_test, pred, average='macro')
recall_tfidf_macro=recall_score(y_test, pred, average='macro')


In [24]:
summary.append(['TFIDF','Logistic Regression',find_highest_alpha(f1_test_tfidf_dict),accuracy_tfidf,hamming_tfidf,f1_score_tfidf_micro,precision_tfidf_micro,recall_tfidf_micro,f1_score_tfidf_macro,precision_tfidf_macro,recall_tfidf_macro])

## Applying Linear SVM with HyperParameter Tuning on BOW

In [25]:
# Please write all the code with proper documentation
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from tqdm import tqdm
from sklearn.metrics import f1_score


f1_test_bow_dict={}


alpha=[0.00001,0.0001,0.001,0.01,0.1,1,10,100]
for i in tqdm(alpha):
    # create instance of model
    sgd=OneVsRestClassifier(SGDClassifier(loss='hinge',penalty='l1', alpha=i), n_jobs=-1)
    
     # fitting the model on crossvalidation train
    sgd.fit(x_train_bow, y_train)
    
    # predict the response on the crossvalidation train
    pred = sgd.predict(x_test_bow)
        
    f1 = f1_score(y_test, pred, average='micro')
    
    f1_test_bow_dict[i]=f1
    
    
    
    
print(f1_test_bow_dict)


100%|█████████████████████████████████████████████████████████████████████████████████| 8/8 [1:42:13<00:00, 811.17s/it]


{1e-05: 0.4195889729327029, 0.0001: 0.4342847526857868, 0.001: 0.32237153381177897, 0.01: 0.1633757591974595, 0.1: 0.0353479244091291, 1: 0.0, 10: 0.0, 100: 0.03419900912758285}


### Find the best alpha value from the result

In [26]:
#https://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary
def find_highest_alpha(k_dict):
    k=max(k_dict, key=k_dict.get)
    return k 

print(find_highest_alpha(f1_test_bow_dict))

0.0001


In [27]:
from sklearn.metrics import precision_score,recall_score,accuracy_score,hamming_loss


sgd=OneVsRestClassifier(SGDClassifier(loss='hinge',penalty='l1', alpha=find_highest_alpha(f1_test_bow_dict)), n_jobs=-1)
sgd.fit(x_train_bow, y_train)
pred = sgd.predict(x_test_bow)


In [28]:
accuracy_bow=accuracy_score(y_test, pred)
hamming_bow=hamming_loss(y_test,pred)

f1_score_bow_micro=f1_score(y_test, pred, average='micro')
precision_bow_micro=precision_score(y_test, pred, average='micro')
recall_bow_micro=recall_score(y_test, pred, average='micro')

f1_score_bow_macro=f1_score(y_test, pred, average='macro')
precision_bow_macro=precision_score(y_test, pred, average='macro')
recall_bow_macro=recall_score(y_test, pred, average='macro')

In [29]:
summary.append(['BOW','Linear SVM',find_highest_alpha(f1_test_bow_dict),accuracy_bow,hamming_bow,f1_score_bow_micro,precision_bow_micro,recall_bow_micro,f1_score_bow_macro,precision_bow_macro,recall_bow_macro])

## Applying Linear SVM with HyperParameter Tuning on TFIDF

In [30]:
f1_test_tfidf_dict={}


alpha=[0.00001,0.0001,0.001,0.01,0.1,1,10,100]
for i in tqdm(alpha):
    # create instance of model
    sgd=OneVsRestClassifier(SGDClassifier(loss='hinge',penalty='l1', alpha=i), n_jobs=-1)
    
     # fitting the model on crossvalidation train
    sgd.fit(x_train_tfidf, y_train)
    
    # predict the response on the crossvalidation train
    pred = sgd.predict(x_test_tfidf)
        
    f1 = f1_score(y_test, pred, average='micro')
    
    f1_test_tfidf_dict[i]=f1
    
print(f1_test_tfidf_dict)


100%|█████████████████████████████████████████████████████████████████████████████████| 8/8 [1:27:00<00:00, 695.53s/it]


{1e-05: 0.4330509258869754, 0.0001: 0.33602794593704854, 0.001: 0.16853310711902633, 0.01: 0.011676644994539081, 0.1: 0.0, 1: 0.0, 10: 0.0, 100: 0.01404449295367725}


## Find the best alpha

In [31]:
print(find_highest_alpha(f1_test_tfidf_dict))

1e-05


In [32]:
sgd=OneVsRestClassifier(SGDClassifier(loss='hinge',penalty='l1', alpha=find_highest_alpha(f1_test_tfidf_dict)), n_jobs=-1)
sgd.fit(x_train_tfidf, y_train)
pred = sgd.predict(x_test_tfidf)

accuracy_tfidf=accuracy_score(y_test, pred)
hamming_tfidf=hamming_loss(y_test,pred)

f1_score_tfidf_micro=f1_score(y_test, pred, average='micro')
precision_tfidf_micro=precision_score(y_test, pred, average='micro')
recall_tfidf_micro=recall_score(y_test, pred, average='micro')

f1_score_tfidf_macro=f1_score(y_test, pred, average='macro')
precision_tfidf_macro=precision_score(y_test, pred, average='macro')
recall_tfidf_macro=recall_score(y_test, pred, average='macro')


In [33]:
summary.append(['TFIDF','Linear SVM',find_highest_alpha(f1_test_tfidf_dict),accuracy_tfidf,hamming_tfidf,f1_score_tfidf_micro,precision_tfidf_micro,recall_tfidf_micro,f1_score_tfidf_macro,precision_tfidf_macro,recall_tfidf_macro])

## Conclusions

In [34]:
# Please compare all your models using Prettytable library
from prettytable import PrettyTable

x = PrettyTable()
x.field_names = ["Vectorizer", "Model", "HyperParameter","Accuracy","ham_loss","f1_score_tfidf_micro","precision_tfidf_micro","recall_tfidf_micro","f1_score_tfidf_macro","precision_tfidf_macro","recall_tfidf_macro"]

for each in summary:
    x.add_row(each)

print(x)

+------------+---------------------+----------------+----------+------------+----------------------+-----------------------+--------------------+----------------------+-----------------------+---------------------+
| Vectorizer |        Model        | HyperParameter | Accuracy |  ham_loss  | f1_score_tfidf_micro | precision_tfidf_micro | recall_tfidf_micro | f1_score_tfidf_macro | precision_tfidf_macro |  recall_tfidf_macro |
+------------+---------------------+----------------+----------+------------+----------------------+-----------------------+--------------------+----------------------+-----------------------+---------------------+
|    BOW     | Logistic Regression |       1        | 0.22311  | 0.00294636 |  0.4596690189404498  |   0.6340355350493767  |  0.36052171311532  |  0.3173558544567375  |  0.48177142093047165  |  0.2551698621845632 |
|   TFIDF    | Logistic Regression |       10       | 0.24792  | 0.00274068 |  0.4842800906237535  |   0.7000935779417206  | 0.3701700688099

In [35]:
df=pd.DataFrame(summary)

In [36]:
df.columns = ["Vectorizer", "Model", "HyperParameter","Accuracy","ham_loss","f1_score_tfidf_micro","precision_tfidf_micro","recall_tfidf_micro","f1_score_tfidf_macro","precision_tfidf_macro","recall_tfidf_macro"]

In [37]:
df

Unnamed: 0,Vectorizer,Model,HyperParameter,Accuracy,ham_loss,f1_score_tfidf_micro,precision_tfidf_micro,recall_tfidf_micro,f1_score_tfidf_macro,precision_tfidf_macro,recall_tfidf_macro
0,BOW,Logistic Regression,1.0,0.22311,0.002946,0.459669,0.634036,0.360522,0.317356,0.481771,0.25517
1,TFIDF,Logistic Regression,10.0,0.24792,0.002741,0.48428,0.700094,0.37017,0.367129,0.56205,0.287125
2,BOW,Linear SVM,0.0001,0.21414,0.002973,0.431753,0.643501,0.324857,0.257873,0.350675,0.238057
3,TFIDF,Linear SVM,1e-05,0.24785,0.002696,0.432024,0.807419,0.294911,0.264237,0.425278,0.217813
