### Installing kaggle to access API --> fetch data instead of download

In [2]:
! pip install kaggle



In [3]:
! mkdir ~/.kaggle

mkdir: cannot create directory ‘/N/u/mtavilda/Carbonate/.kaggle’: File exists


In [4]:
! cp kaggle.json ~/.kaggle/
#! cp /kaggle.json ~/.kaggle/

In [5]:
! chmod 600 ~/.kaggle/kaggle.json

### Fetch the data from kaggle API

In [6]:
! kaggle competitions download facebook-recruiting-iii-keyword-extraction -f Train.zip
! kaggle competitions download facebook-recruiting-iii-keyword-extraction -f Test.zip

Train.zip: Skipping, found more recently modified local copy (use --force to force download)
Test.zip: Skipping, found more recently modified local copy (use --force to force download)


In [7]:
!pip uninstall scikit-learn -y
!pip install scikit-learn==0.24.1
!pip install gensim

Found existing installation: scikit-learn 0.24.1
Uninstalling scikit-learn-0.24.1:
  Successfully uninstalled scikit-learn-0.24.1
Collecting scikit-learn==0.24.1
  Using cached scikit_learn-0.24.1-cp38-cp38-manylinux2010_x86_64.whl (24.9 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-0.24.1


In [8]:
!pip install scikit-multilearn



In [9]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from collections import defaultdict
import gensim
from gensim.models import Word2Vec
from skmultilearn.adapt import MLkNN
from sklearn.model_selection import GridSearchCV
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import hamming_loss, accuracy_score,f1_score,recall_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     /N/u/mtavilda/Carbonate/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /N/u/mtavilda/Carbonate/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /N/u/mtavilda/Carbonate/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [10]:
! unzip Train.zip

Archive:  Train.zip
replace Train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [11]:
train_data = pd.read_csv("Train.csv")
len(train_data)

6034195

In [12]:
data = train_data.drop_duplicates(subset = ["Title","Tags"])
len(data)

4186184

Dropping rows with no tags

In [13]:
data = data.dropna(subset=['Tags'])

Using 200000 records for computation purposes

In [14]:
data_split = data.loc[:200000]
#data_split = data.copy()

In [15]:
data_split = data_split.reset_index(drop = True)

In [16]:
def preprocessing(columnname,name):
    cleantext = []
    for i in range(len(data_split)):
      cleantext.append(BeautifulSoup(data_split[columnname][i], "html").text)
    data_split[name] = cleantext
    data_split[name] = data_split[name].apply(lambda x : re.sub(r"[^a-zA-Z0-9]+", ' ', x))
    data_split[name] = data_split[name].apply(lambda s: s.lower() if type(s) == str else s)
    data_split[name] = data_split[name].apply(lambda x : word_tokenize(x))

    stopword = stopwords.words('english')
    data_split[name] = data_split[name].apply(lambda x : [word for word in x if not word in stopword or word=='r' or word=='c'])
    new_name = name + " " + "tokens"
    data_split[new_name] = lemmatize(name)

In [17]:
def lemmatize(name):
    tag_map = defaultdict(lambda : wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV

    lemmatizer = WordNetLemmatizer()

    stemmed = []

    for i in range(len(data_split)):
      ls = []
      for word, tag in pos_tag(data_split[name][i]):
        ls.append(lemmatizer.lemmatize(word, tag_map[tag[0]]))
      stemmed.append(ls)
    return stemmed

Cleaning the "Body" & "Title", removing html tags present

In [18]:
preprocessing('Body',"clean body text")
preprocessing("Title","clean title text")



In [19]:
data_split.head()

Unnamed: 0,Id,Title,Body,Tags,clean body text,clean body text tokens,clean title text,clean title text tokens
0,1,How to check if an uploaded file is an image w...,<p>I'd like to check if an uploaded file is an...,php image-processing file-upload upload mime-t...,"[like, check, uploaded, file, image, file, e, ...","[like, check, upload, file, image, file, e, g,...","[check, uploaded, file, image, without, mime, ...","[check, upload, file, image, without, mime, type]"
1,2,How can I prevent firefox from closing when I ...,"<p>In my favorite editor (vim), I regularly us...",firefox,"[favorite, editor, vim, regularly, use, ctrl, ...","[favorite, editor, vim, regularly, use, ctrl, ...","[prevent, firefox, closing, press, ctrl, w]","[prevent, firefox, close, press, ctrl, w]"
2,3,R Error Invalid type (list) for variable,<p>I am import matlab file and construct a dat...,r matlab machine-learning,"[import, matlab, file, construct, data, frame,...","[import, matlab, file, construct, data, frame,...","[r, error, invalid, type, list, variable]","[r, error, invalid, type, list, variable]"
3,4,How do I replace special characters in a URL?,"<p>This is probably very simple, but I simply ...",c# url encoding,"[probably, simple, simply, find, answer, basic...","[probably, simple, simply, find, answer, basic...","[replace, special, characters, url]","[replace, special, character, url]"
4,5,How to modify whois contact details?,<pre><code>function modify(.......)\n{\n $mco...,php api file-get-contents,"[function, modify, mcontact, file, get, conten...","[function, modify, mcontact, file, get, conten...","[modify, whois, contact, details]","[modify, whois, contact, detail]"


Total number of tags and unique tags in 2 lakh data records


In [20]:
def get_unique_tags(df):
  tags = df['Tags'].str.cat(sep=' ')
  tags = list(tags.split(" "))
  print("Total tags present",len(tags))
  print("Total unique tags present:",len(list(set(tags))))
  return tags

In [21]:
tags = get_unique_tags(data_split)

Total tags present 571038
Total unique tags present: 23686


In [22]:
tag_count = {}
unique_tags = list(set(tags))

for tag in unique_tags:
  #print(tag)
  tag_count[tag] = tags.count(tag)


Creating a dataframe of unique tags and the number of times each tag is appears in the answer.

In [23]:
sorted_tags = pd.DataFrame(sorted(tag_count.items(), key=lambda item: item[1],reverse=True),columns=['tags','count'])

Total number of words and unique words in the training data

Plotting tags per answer and estimating the average number of tags given in each answer

Checking how many records are covered if we consider top 500 tags

In [24]:
top_tags = sorted_tags['tags'][:500]

In [25]:
count = 0
idx = []
for i in range(len(data_split['Tags'])):
  if any(item in data_split['Tags'][i] for item in top_tags):
    count += 1
    idx.append(i)
print(count)

195352


In [26]:
top_tags = sorted_tags['tags'][:500]
type(top_tags)
top_tags_list = list(top_tags)

#top500_tags_list

In [27]:
def find_most_frequent_tags(x):
  #print(x)
  list1 = x.split(" ")
  result = ""
  for tag in list1:
    #print(tag)
    if tag in top_tags_list:
      #print("in")
      result +=" " + tag
  return result

In [28]:
data_tags = data_split['Tags'].apply(lambda x: find_most_frequent_tags(x))
data_tags = data_tags.apply(lambda x: x if len(x)>0 else None)
data_new = pd.concat([data_split[['clean title text tokens','clean body text tokens']],data_tags],axis=1)
data_new.dropna(subset= ['Tags'],inplace = True)
data_new.reset_index(drop= True,inplace = True)


In [29]:
data_new.shape

(177270, 3)

In [30]:
data_preprocessed = data_new.copy()
data_preprocessed['body tokens'] = data_preprocessed['clean body text tokens'].apply(lambda x : " ".join(x))
data_preprocessed['title tokens'] = data_preprocessed['clean title text tokens'].apply(lambda x : " ".join(x))

data_preprocessed.head()


Unnamed: 0,clean title text tokens,clean body text tokens,Tags,body tokens,title tokens
0,"[check, upload, file, image, without, mime, type]","[like, check, upload, file, image, file, e, g,...",php image-processing file-upload upload,like check upload file image file e g png jpg ...,check upload file image without mime type
1,"[prevent, firefox, close, press, ctrl, w]","[favorite, editor, vim, regularly, use, ctrl, ...",firefox,favorite editor vim regularly use ctrl w execu...,prevent firefox close press ctrl w
2,"[r, error, invalid, type, list, variable]","[import, matlab, file, construct, data, frame,...",r matlab,import matlab file construct data frame matlab...,r error invalid type list variable
3,"[replace, special, character, url]","[probably, simple, simply, find, answer, basic...",c# url encoding,probably simple simply find answer basicaly wa...,replace special character url
4,"[modify, whois, contact, detail]","[function, modify, mcontact, file, get, conten...",php api,function modify mcontact file get content http...,modify whois contact detail


Converting tags for multilabel problems

In [31]:
unique_tags_preprocessed = get_unique_tags(data_preprocessed)

Total tags present 530560
Total unique tags present: 501


Vectorizing the Tags using CountVectorizer

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(tokenizer = lambda x: x.split(), binary='true')
multilabel_y = vectorizer.fit_transform(data_preprocessed['Tags'])
#multilabel_y = vectorizer.fit_transform(uniq_tags_temp)
multilabel_y.toarray().shape

(177270, 500)

In [33]:
data_preprocessed.shape

(177270, 5)

Using TFIDF for clean text tokens(questions)

In [34]:
body_vectorizer = TfidfVectorizer()
title_vectorizer = TfidfVectorizer()

# fitting the tf-idf on the given data
body_vector = body_vectorizer.fit_transform(data_preprocessed['body tokens'])
title_vector = title_vectorizer.fit_transform(data_preprocessed['title tokens'])

In [35]:
from scipy.sparse import hstack
questions = hstack([body_vector,title_vector])

In [36]:
print(questions.shape)
print(multilabel_y.toarray().shape)

(177270, 514133)
(177270, 500)


### Splitting the data into test and train

In [38]:
x_train,x_test,y_train,y_test = train_test_split(questions,multilabel_y)

In [39]:
print("X train shape",x_train.shape)
print("Y train shape",y_train.shape)
print("X test shape",x_test.shape)
print("Y test shape",y_test.shape)

X train shape (132952, 514133)
Y train shape (132952, 500)
X test shape (44318, 514133)
Y test shape (44318, 500)


In [40]:
import time
a = time.time()
mlknn_classifier = MLkNN(k=2)
mlknn_classifier.fit(x_train,y_train)
print("Time taken to train" ,time.time()-a)



Time taken to train 2452.4246077537537


In [41]:
test_sent = ["corpus <- VCorpus(VectorSource(text)) tdm <- TermDocumentMatrix()"]
qs1 = body_vectorizer.transform(test_sent)
qs2 = title_vectorizer.transform(test_sent)
q = hstack([qs1,qs2])
q.shape

(1, 514133)

In [43]:
import numpy as np
op = mlknn_classifier.predict(q)
np.where(op.toarray() == 1)

(array([0]), array([495]))

In [44]:
vectorizer.inverse_transform(op)

[array(['xml'], dtype='<U25')]

### Checking the accuracy metric

In [45]:
pred = mlknn_classifier.predict(x_test)

In [46]:
pred.shape

(44318, 500)

In [47]:
accuracy_score(y_test,pred)
hamming_loss(y_test,pred)

0.00578081140845706

#### Classification report

In [48]:
from sklearn.metrics import classification_report
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.81      0.37      0.51       174
           1       0.12      0.20      0.15      1287
           2       0.01      0.02      0.02        52
           3       0.01      0.02      0.02        50
           4       0.06      0.10      0.08        52
           5       0.14      0.26      0.18        34
           6       0.66      0.22      0.33        94
           7       0.11      0.13      0.12        63
           8       0.33      0.35      0.34       262
           9       0.38      0.43      0.40        98
          10       0.20      0.26      0.23        86
          11       0.16      0.24      0.19        55
          12       0.08      0.11      0.09        46
          13       0.24      0.41      0.30       506
          14       0.45      0.14      0.22        70
          15       0.17      0.22      0.19       242
          16       0.39      0.57      0.46        82
          17       0.22    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### GridSearch for optimal hyperparameters

In [49]:
parameters = {'k': range(1,3), 's': [0.5, 0.7, 1.0]}
score = 'f1_micro'

start=time.time()

classifier = GridSearchCV(MLkNN(), parameters, scoring=score)
classifier.fit(questions,multilabel_y)
print('training time taken: ',round(time.time()-start,0),'seconds')




training time taken:  98061.0 seconds


In [50]:
print('best parameters :', classifier.best_params_,
      'best score: ',classifier.best_score_)

best parameters : {'k': 2, 's': 0.5} best score:  0.30983725334516005
