In [1]:
#import libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from bs4 import BeautifulSoup
import nltk
import re
import string
import pickle

In [3]:
# Load the preprocessed inputs from Feather files in the working directory.
# X is later filtered on its 'Id' column; y is later grouped by 'Tag' over 'Id'.

X_path = './X.ftr'
y_path = './tags.ftr'

X = pd.read_feather(X_path)
y = pd.read_feather(y_path)

In [8]:
# Settings for the per-tag training loop.
num_tags = 100  # how many of the most frequently used tags get a classifier
num_questions_per_tag = 1000  # cap on positive examples, and separately on negative examples, per tag
num_words_per_tag = 100  # number of word features selected per tag
models = {}  # receives the trained model and its features for every tag

# Rank the tags by the number of distinct question ids that carry them,
# then keep only the `num_tags` most frequent ones.

questions_per_tag = y.groupby('Tag')['Id'].nunique()
ranked_tags = questions_per_tag.sort_values(ascending=False)
tags_count = ranked_tags.reset_index(name='count').head(num_tags)
tags_to_use = tags_count['Tag']

# Extract features and build one binary "has this tag?" model for each tag.
#
# For every tag in `tags_to_use`:
#   1. sample up to `num_questions_per_tag` positive and negative question ids from `y`,
#   2. pick the `num_words_per_tag` most frequent tokens of the positive questions as features,
#   3. encode each sampled question as 0/1 word-presence indicators,
#   4. fit a LinearSVC on an 80/20 train/test split and record it in `models`.

for tag in tags_to_use:
    # --- 1. choose the questions used for this tag -------------------------------------------
    has_tag = y['Tag'] == tag
    tagged_ids = y.loc[has_tag, 'Id'].drop_duplicates()  # every question carrying the tag, in y's row order
    pos_tag_question_ids = tagged_ids.head(num_questions_per_tag).tolist()

    # Negatives are the first distinct ids (in y's row order) that never carry the tag.
    # Excluding *every* tagged id, not only the sampled positives, keeps a tagged question from
    # being labelled negative once the positive quota is full, and the negative sample is no
    # longer shortened when an id turns out to be positive later in the scan.
    other_ids = y.loc[~has_tag, 'Id'].drop_duplicates()
    untagged_ids = other_ids[~other_ids.isin(set(tagged_ids))]
    neg_tag_question_ids = untagged_ids.head(num_questions_per_tag).tolist()

    pos_id_set = set(pos_tag_question_ids)  # set membership instead of repeated list scans
    question_ids = pos_id_set.union(neg_tag_question_ids)
    questions = X[X['Id'].isin(question_ids)]  # rows of X used for the current tag
    is_positive = questions['Id'].isin(pos_id_set)

    # --- 2. tokenize once, then select the word features -------------------------------------
    # The text columns are addressed by name ('Title' and 'Body', the same names that are
    # excluded from the training matrix below) rather than by column position.
    tokens_per_question = [
        nltk.tokenize.word_tokenize(title) + nltk.tokenize.word_tokenize(body)
        for title, body in zip(questions['Title'], questions['Body'])
    ]

    word_counts = {}  # token -> number of occurrences across the positive questions
    for tokens, positive in zip(tokens_per_question, is_positive):
        if positive:
            for word in tokens:
                word_counts[word] = word_counts.get(word, 0) + 1
    # sorted() is stable, so equally frequent words keep their first-seen order
    ranked_words = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
    tag_word_features = [word for word, _ in ranked_words[:num_words_per_tag]]
    word_features = set(tag_word_features)  # set of words to be used as features

    # --- 3. populate the feature columns -----------------------------------------------------
    # The indicator matrix is built separately from the text columns, so a feature word that
    # happens to be spelled 'Id', 'Title' or 'Body' cannot overwrite them, and nothing is
    # assigned into a slice of X.
    column_of_word = {word: position for position, word in enumerate(tag_word_features)}
    indicators = np.zeros((len(tokens_per_question), len(tag_word_features)), dtype=int)
    for row, tokens in enumerate(tokens_per_question):
        for word in word_features.intersection(tokens):
            indicators[row, column_of_word[word]] = 1
    word_matrix = pd.DataFrame(indicators, index=questions.index, columns=tag_word_features)

    # Any columns of X other than 'Id', 'Title' and 'Body' are kept as features, as before.
    X_with_valid_tags = pd.concat(
        [questions.drop(columns=['Id', 'Title', 'Body']), word_matrix], axis=1
    )

    # --- populate the output labels: 1 for positive questions, 0 otherwise -------------------
    # A 1-d Series is what scikit-learn expects for y.
    y_with_valid_tags = is_positive.astype(int).rename(tag)
    if y_with_valid_tags.nunique() < 2:
        # A classifier cannot be fitted without examples of both classes; report and move on.
        print("Tag: " + tag, "skipped: need both positive and negative examples")
        continue

    # --- 4. build and train the model, then record it in the 'models' dictionary -------------
    X_train, X_test, y_train, y_test = train_test_split(X_with_valid_tags, y_with_valid_tags, test_size=0.2, random_state=0)
    clf = LinearSVC()
    clf.fit(X_train, y_train)
    # 'features' stays an unordered set as before; 'feature_order' additionally records the
    # exact column order the model was trained on, which a set cannot convey.
    models[tag] = {'features': word_features, 'model': clf, 'feature_order': list(X_with_valid_tags.columns)}
    print("Tag: " + tag, "Train Accuracy: " + str(clf.score(X_train, y_train)), "Test Accuracy: " + str(clf.score(X_test, y_test)))

In [7]:
# Save the `models` dictionary as a pickle at ./models.pkl.
# The context manager closes the file even if pickle.dump raises part-way through.
# NOTE: only unpickle this file from a trusted location; pickle.load can execute arbitrary code.

with open('./models.pkl', 'wb') as output:
    pickle.dump(models, output)