In [0]:
from google.colab import drive
drive.mount('drive')

In [0]:
# Run this cell and select the kaggle.json file downloaded
# from the Kaggle account settings page.
from google.colab import files
files.upload()

In [0]:
# Let's make sure the kaggle.json file is present.
!ls -lha kaggle.json

# Next, install the Kaggle API client.
!pip install -q kaggle


# The Kaggle API client expects this file to be in ~/.kaggle,
# so move it there.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

# This permissions change avoids a warning on Kaggle tool startup.
!chmod 600 ~/.kaggle/kaggle.json

-rw-r--r-- 1 root root 69 Aug 24 15:06 kaggle.json


In [0]:
!kaggle competitions download -c facebook-recruiting-iii-keyword-extraction

Downloading Train.zip to /content
100% 2.18G/2.19G [00:42<00:00, 115MB/s]
100% 2.19G/2.19G [00:42<00:00, 55.0MB/s]
Downloading Test.zip to /content
 99% 720M/725M [00:07<00:00, 82.3MB/s]
100% 725M/725M [00:07<00:00, 95.6MB/s]
Downloading SampleSubmission.csv to /content
 99% 78.0M/78.7M [00:00<00:00, 84.8MB/s]
100% 78.7M/78.7M [00:00<00:00, 114MB/s] 


In [0]:
!unzip \*.zip

Archive:  Test.zip
  inflating: Test.csv                

Archive:  Train.zip
  inflating: Train.csv               

2 archives were successfully processed.


In [0]:
import pandas as pd
import numpy as np


df = pd.read_csv("Train.csv")

print("Before: ", df.shape)

df.dropna(inplace=True)

print("After: ", df.shape)

df.reset_index(inplace=True)

df.drop(columns=['index'], inplace=True)

df.to_csv("Train_No_NaN.csv", index=False)

In [0]:
!cp Train_No_NaN.csv drive/My\ Drive/tcs/

In [0]:
all_tags = [tag for slist in df["Tags"].values for tag in slist.split()]
len(all_tags)

17409986

In [0]:
import collections

counter=collections.Counter(all_tags)

counter = { x:y for x, y in sorted(counter.items(), key=lambda x: x[1], reverse=True) }

top_tags = set(list(counter.keys())[:500])

In [0]:
req_indices=[]

for i in range(df.shape[0]):
  tags = set(df["Tags"][i].split())
  if tags.issubset(top_tags):
    req_indices.append(i)


req_indices


In [0]:
import random

req_indices_5L = random.sample(req_indices, k=500000)

In [0]:
req_indices_df = pd.DataFrame({"Indices": req_indices_5L})

req_indices_df.to_csv("req_indices.csv", index=False)

In [0]:
!cp req_indices.csv drive/My\ Drive/tcs/

In [0]:
df.drop(columns=["Id"], inplace=True)

df = df.iloc[req_indices_5L, :]

df.reset_index(inplace=True)
df.drop(columns=["index"], inplace=True)
df.to_csv("Train_Nan_5L.csv", index=True)

In [0]:
!cp Train_Nan_5L.csv drive/My\ Drive/tcs/

In [0]:
####################################################################################

In [0]:
import pandas as pd

df = pd.read_csv("Train_Nan_5L.csv", usecols=["Title", "Body", "Tags"])

In [0]:


import re

titles_list=[]
for i in range(df.shape[0]):
  x = re.sub(r"</{0,1}[\w]+>", " ", df["Title"][i] )
  x= re.sub(r"C\+\+", "CPP", x)
  x = re.sub(r"C#", "CSHARP", x)
  x = re.sub(r"[^a-zA-Z]+", " ", x)
  x = re.sub(r"[\s]+", " ", x)
  x = re.sub(r"(?:^| )+[a-bd-qs-zA-BD-QS-Z](?=$| )+", "", x)
  titles_list.append(x.lower())


df["Title"] = titles_list




In [0]:

import re
code_list=[]
bodies_list=[]
for i in range(df.shape[0]):
  code = str( re.findall(r"<code>(.*?)</code>", df["Body"][i], flags=re.DOTALL) ) or "No_code"
  x = re.sub(r"<code>(.*?)</code>", " ", df["Body"][i], flags=re.MULTILINE|re.DOTALL)
  x = re.sub(r"</{0,1}[\w]+>", " ", x )
  x = re.sub(r"C\+\+", "CPP", x)
  x = re.sub(r"C#", "CSHARP", x)
  x = re.sub(r"[^a-zA-Z]+", " ", x)
  x = re.sub(r"[\s]+", " ", x)
  x = re.sub(r"(?:^| )+[a-bd-qs-zA-BD-QS-Z]{1,2}(?=$| )+", "", x)


  bodies_list.append(x.lower())
  code_list.append(code.lower())

df["Body"] = bodies_list
df["Code"] = code_list



In [0]:
# code me 4+ letters waale characters alphabets only

import re
clean_code_list = []
for i in range(df.shape[0]):
 
  x = re.sub(r"[^a-zA-Z]+", " ", df["Code"][i])
  x = re.sub(r"\b\w{1,3}\b", " ", x)
  x = re.sub(r"[\s]+", " ", x)
  x = x.strip()


  clean_code_list.append(x)

df["Code"] = clean_code_list

In [0]:
df.head()

Unnamed: 0,Title,Body,Tags,Code
0,problem with outlook displaying an html message,sending html formatted email message using ma...,c# email outlook,mailmessage message mailmessage nmessage body ...
1,jquery two buttons making picture move from le...,want make picture stickman picture move from ...,javascript jquery html button,function personleft animate left nfunction per...
2,how to get dropdownlist selected value nested ...,have problem retrieving the value dropdownlis...,drop-down-menu,repeater categoryfood runat server itemtemplat...
3,custom cancel for futuretask,have implemented custom cancellation logic de...,java concurrency,
4,capture image show and send to server in android,hello want capture image click button and aft...,android,protected void onactivityresult requestcode re...


In [0]:
df.to_csv("Cleaned_Train.csv", index=True)

In [0]:
!cp Cleaned_Train.csv drive/My\ Drive/tcs/

In [0]:
##############################################################################################

In [0]:
df_tbc = pd.DataFrame()

df_tbc["TBC"] = df["Title"] + " " + df["Body"] + " " + df["Code"]


In [0]:
df_tbc.to_csv("TBC.csv", index=True)

!cp TBC.csv drive/My\ Drive/tcs/

In [0]:
# Total no of words
# Average words/ example


len_tbc = 0

for i in range(df_tbc.shape[0]):
  l = len(df_tbc["TBC"][i].split())

  len_tbc += l

print(len_tbc)

len_tbc/df_tbc.shape[0]

52776606


105.553212

In [0]:
##########################################################################################

In [0]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

nltk.download('punkt')

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
words = word_tokenize(df_tbc.iloc[0,0])

t = " ".join(str(stemmer.stem(word)) for word in words if word not in stop_words )

print(t)

problem outlook display html messag send html format email messag use mailmessag class code follow see receiv messag outlook show whole html instead bold text messag wrong mailmessag messag mailmessag nmessag bodi html bodi test messag bodi html nmessag isbodyhtml true skip set irrelev nnew smtpclient send messag html bodi test messag bodi html


In [0]:
tok_stop_stem_list=[]

for i in range(df_tbc.shape[0]):
  words = word_tokenize(df_tbc.iloc[i,0])
  t = " ".join(str(stemmer.stem(word)) for word in words if word not in stop_words  )
  tok_stop_stem_list.append(t)
  if not i%100000:
    print(i)

0
100000
200000
300000
400000


In [0]:
df_tbc["TBC"] = tok_stop_stem_list
df_tbc["Tags"] = df["Tags"]

df_tbc.to_csv("TBC_tok_stop_stem.csv", index=True)

(500000, 2)


Unnamed: 0,TBC,Tags
0,problem outlook display html messag send html ...,c# email outlook
1,jqueri two button make pictur move left right ...,javascript jquery html button
2,get dropdownlist select valu nest repeat probl...,drop-down-menu
3,custom cancel futuretask implement custom canc...,java concurrency
4,captur imag show send server android hello wan...,android


In [0]:
!cp TBC_tok_stop_stem.csv drive/My\ Drive/tcs/

In [0]:
# Total no of words
# Average words/ example


len_tbc_new = 0

for i in range(df_tbc.shape[0]):
  l = len(df_tbc["TBC"][i].split())

  len_tbc_new += l

print("OLD:")
print("Total:", len_tbc)
print("Words/row:", len_tbc/df_tbc.shape[0])

print("NEW:")
print("Total:", len_tbc_new)
print("Words/row:", len_tbc_new/df_tbc.shape[0])

OLD:
Total: 52776606
Words/row: 105.553212
NEW:
Total: 40321128
Words/row: 80.642256


In [0]:
##########################################################################################3

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
# binary='true' will give a binary vectorizer
vectorizer = CountVectorizer(tokenizer = lambda x: x.split(), binary='true')
tags = vectorizer.fit_transform(df_tbc['Tags'])


In [0]:
# saving as npz

from scipy import sparse

sparse.save_npz("Tags_vectorized.npz", tags)

!cp Tags_vectorized.npz drive/My\ Drive/tcs/

In [0]:
# saving as csv

tags.to_csv("Tags_vectorized.csv", index=True)

!cp Tags_vectorized.csv drive/My\ Drive/tcs/

In [0]:
import pickle

pickle.dump(vectorizer.vocabulary_, open("tags_vectorizer.pickle", "wb"))

!cp tags_vectorizer.pickle drive/My\ Drive/tcs/


# to load
# loaded_vec = CountVectorizer(decode_error="replace",vocabulary=pickle.load(open("feature.pkl", "rb")))

In [0]:
import pandas as pd
df_tbc = pd.read_csv("drive/My Drive/tcs/TBC_tok_stop_stem.csv", usecols=["TBC"])

In [0]:
# Ngrams = (1,1)
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=0.00009,tokenizer = lambda x: x.split(), ngram_range=(1,1),max_features=25000)
X_data = vectorizer.fit_transform(df_tbc["TBC"])


In [0]:
import pickle

pickle.dump(vectorizer.vocabulary_, open("ngram_1_1_vectorizer.pickle", "wb"))

!cp ngram_1_1_vectorizer.pickle drive/My\ Drive/tcs/

In [0]:
# X_data = pd.DataFrame(X_data.toarray())

from scipy import sparse

sparse.save_npz("X_data_1_1.npz", X_data)

!cp X_data_1_1.npz drive/My\ Drive/tcs/


In [0]:
# Ngrams = (1,4)


vectorizer = CountVectorizer(min_df=0.00009,tokenizer = lambda x: x.split(), ngram_range=(1,4),max_features=25000)
X_data_1_4 = vectorizer.fit_transform(df_tbc["TBC"])

In [0]:
pickle.dump(vectorizer.vocabulary_, open("ngram_1_4_vectorizer.pickle", "wb"))
sparse.save_npz("X_data_1_4.npz", X_data_1_4)

!cp X_data_1_4.npz drive/My\ Drive/tcs/
!cp ngram_1_4_vectorizer.pickle drive/My\ Drive/tcs/

In [0]:
###########################################################################

In [0]:
# load X_data_1_1.npz
# load tags vectorized

In [0]:
# load X_data_1_4.npz
# load tags vectorized

In [0]:
from google.colab import drive
drive.mount('drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at drive


In [0]:
from scipy import sparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB

In [0]:
############################################

# Logistic Regression - (1,1)



from scipy import sparse


X_train_1_1 = sparse.load_npz("drive/My Drive/tcs/X_data_1_1.npz")
y_train = sparse.load_npz("drive/My Drive/tcs/Tags_vectorized.npz")

clf_log_reg_1 = OneVsRestClassifier(LogisticRegression(C=0.1, penalty='l2', verbose=1))

clf_log_reg_1.fit(X_train_1_1, y_train)


import pickle
pickle.dump(clf_log_reg_1, open("1_log_reg_clf.pickle", "wb"))
!cp 1_log_reg_clf.pickle drive/My\ Drive/tcs/

In [0]:
# Logistic Regression - (1,4)

from scipy import sparse


X_train_1_4 = sparse.load_npz("drive/My Drive/tcs/X_data_1_4.npz")
y_train = sparse.load_npz("drive/My Drive/tcs/Tags_vectorized.npz")

clf_log_reg_2 = OneVsRestClassifier(LogisticRegression(C=0.1, penalty='l2'))

clf_log_reg_2.fit(X_train_1_4, y_train)


import pickle
pickle.dump(clg_log_reg_2, open("2_log_reg_clf.pickle", "wb"))

!cp 2_log_reg_clf.pickle drive/My\ Drive/tcs/

In [0]:
####################################

# SGDClassifier-(1,1) --------------- done   ------- discard

from scipy import sparse


X_train_1_1 = sparse.load_npz("drive/My Drive/tcs/X_data_1_1.npz")
y_train = sparse.load_npz("drive/My Drive/tcs/Tags_vectorized.npz")



clf_sgd_1 = OneVsRestClassifier(SGDClassifier(loss='hinge', alpha=1, penalty='l2',n_jobs=-1))
clf_sgd_1.fit(X_train_1_1, y_train)


import pickle
pickle.dump(clf_sgd_1, open("3_sgd_clf.pickle", "wb"))

!cp 3_sgd_clf.pickle drive/My\ Drive/tcs/

In [0]:
# SGDClassifier-(1,4) --------------- done ---------- discard

from scipy import sparse


X_train_1_4 = sparse.load_npz("drive/My Drive/tcs/X_data_1_4.npz")
y_train = sparse.load_npz("drive/My Drive/tcs/Tags_vectorized.npz")



clf_sgd_2 = OneVsRestClassifier(SGDClassifier(loss='hinge', alpha=1, penalty='l2',n_jobs=-1))
clf_sgd_2.fit(X_train_1_4, y_train)


import pickle
pickle.dump(clf_sgd_2, open("4_sgd_clf.pickle", "wb"))

!cp 4_sgd_clf.pickle drive/My\ Drive/tcs/

In [0]:
###########################################################

# SVC - (1,1)


from scipy import sparse


X_train_1_1 = sparse.load_npz("drive/My Drive/tcs/X_data_1_1.npz")
y_train = sparse.load_npz("drive/My Drive/tcs/Tags_vectorized.npz")


clf_svc = OneVsRestClassifier(SVC())

clf_svc.fit(X_train_1_1, y_train)


import pickle
pickle.dump(clf_svc, open("5_svc_clf.pickle", "wb"))

!cp 5_svc_clf.pickle drive/My\ Drive/tcs/



In [0]:
##############################
# Testing
from sklearn.feature_extraction.text import CountVectorizer
X_train_1_1 = sparse.load_npz("drive/My Drive/tcs/X_data_1_1.npz")


clf_sgd_1 = pickle.load(open("drive/My Drive/tcs/3_sgd_clf.pickle", "rb"))

loaded_vec = CountVectorizer(decode_error="replace",vocabulary=pickle.load(open("drive/My Drive/tcs/tags_vectorizer.pickle", "rb")))
loaded_vec._validate_vocabulary()


test = X_train_1_1[100:200, :]

pred = clf_sgd_1.predict(test)

loaded_vec.inverse_transform(pred)


[array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], dtype='<U24'),
 array([], d