Importing Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import nltk
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingClassifier
import lightgbm as lgb
from sklearn.metrics import f1_score

In [None]:
# Fetch only the NLTK resources the visible cells use, without the interactive
# prompt, so that "Restart & Run All" does not stall waiting for keyboard input.
#   - 'stopwords' backs stopwords.words('english')
#   - 'punkt' / 'punkt_tab' back word_tokenize (newer NLTK releases look for
#     'punkt_tab', older ones for 'punkt'; an id unknown to the installed
#     version is reported as a failed download rather than raising).
# NOTE(review): WordNetLemmatizer and pos_tag are imported but not called in the
# cells shown here; add 'wordnet' / 'averaged_perceptron_tagger' if they get used.
for nltk_resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(nltk_resource, quiet=True)

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> all


    Downloading collection 'all'
       | 
       | Downloading package abc to /root/nltk_data...
       |   Unzipping corpora/abc.zip.
       | Downloading package alpino to /root/nltk_data...
       |   Unzipping corpora/alpino.zip.
       | Downloading package averaged_perceptron_tagger to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger.zip.
       | Downloading package averaged_perceptron_tagger_ru to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger_ru.zip.
       | Downloading package basque_grammars to /root/nltk_data...
       |   Unzipping grammars/basque_grammars.zip.
       | Downloading package bcp47 to /root/nltk_data...
       | Downloading package biocreative_ppi to /root/nltk_data...
       |   Unzipping corpora/biocreative_ppi.zip.
       | Downloading package bllip_wsj_no_aux to /root/nltk_data...
       |   Unzipping models/bllip_wsj_no_aux.zip.
       | Downloading package book_grammars to


---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [None]:
# Read the labelled training data.
df = pd.read_csv('/content/drive/MyDrive/NEW_TRAIN.csv')

# Append the document body to 'name' so one text column feeds the model, then
# remove the columns that are neither the text feature nor label columns.
df = df.assign(name=df['name'] + " " + df['document_text']).drop(
    columns=['id', 'document_text', 'cat_name']
)
df.head()

In [None]:
# Remove the 'Unnamed: 0' column (the name pandas gives a CSV column with an empty
# header; presumably a previously saved index - confirm against the file).
# errors='ignore' plus reassignment keeps this cell safe to re-run: a second
# execution is a no-op instead of a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
stop_words = set(stopwords.words('english'))  # a set gives O(1) membership checks per token

# Patterns are compiled once rather than on every row. Raw strings avoid the
# invalid escape sequences ('\]' and '\s') that recent Python versions warn about.
_PUNCT_RE = re.compile('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""))
_DIGIT_RE = re.compile(r'\d+')  # '+' instead of '*': same result without empty matches
_SPACE_RE = re.compile(r'\s+')


def cleanResume(resumeText):
    """Normalise a raw text string before TF-IDF vectorisation.

    Steps, in order: lower-case, replace ASCII punctuation with spaces, delete
    digits, collapse whitespace runs to a single space, tokenise with NLTK's
    ``word_tokenize`` and drop English stop words.

    Parameters
    ----------
    resumeText : str
        Raw text. Any non-string value (e.g. ``None`` or a NaN coming from a
        missing cell) is treated as empty text instead of raising.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' for non-string input).
    """
    if not isinstance(resumeText, str):
        return ''

    text = resumeText.lower()
    text = _PUNCT_RE.sub(' ', text)   # punctuation -> space so adjoining words stay separate
    text = _DIGIT_RE.sub('', text)    # digits are removed outright
    text = _SPACE_RE.sub(' ', text)   # squeeze the gaps left by the two steps above

    kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]
    return ' '.join(kept_tokens)

In [None]:
# Clean every training text in the 'name' column with cleanResume.
df['name'] = df['name'].apply(cleanResume)

In [None]:
# Everything except the text column is a target label column.
y = df.drop(columns=["name"])

In [None]:
# Hold out 25% of the rows for validation (the size scikit-learn uses when
# test_size is omitted, now stated explicitly). random_state pins the shuffle so
# the split, and every score computed from it, is reproducible across restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df['name'], y, test_size=0.25, random_state=42
)

In [None]:
# Multi-label model: OneVsRestClassifier fits one binary classifier per label
# column, and each of those is a stacking ensemble of a random forest and a
# LightGBM model. StackingClassifier requires every base learner as a
# (name, estimator) pair; no final_estimator is passed, so scikit-learn's
# default (logistic regression) combines the base predictions.
stack = OneVsRestClassifier(
    StackingClassifier(
        estimators=[
            ('rf', RandomForestClassifier()),
            ('lgb', lgb.LGBMClassifier(max_depth=-1, n_estimators=500)),
        ]
    )
)

In [None]:
# Bundle vectorisation and the stacked classifier so that fit and predict always
# apply the same text-to-feature transformation.
stack_pipeline = Pipeline(steps=[
    # TF-IDF over unigrams and bigrams, capped at 8000 features, skipping terms
    # that occur in more than 90% of documents (settings the author reports
    # choosing with GridSearchCV).
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=8000, max_df=0.9)),
    ('stack', stack),
])
# Train the full pipeline on the training split.
stack_pipeline.fit(X_train, y_train)

In [None]:
predictions= stack_pipeline.predict(X_test) # predict the label matrix for the held-out split X_test

In [None]:
# Make sure the predictions are a NumPy array before inspecting shape/contents
# (np.asarray converts array-likes and passes an existing ndarray through).
predictions = np.asarray(predictions)

In [None]:
# Shape of the held-out targets, for comparison with the shape of `predictions`.
y_test.shape

(2288, 50)

In [None]:
# Shape of the predicted label matrix; it should match y_test.shape.
predictions.shape

(2288, 50)

In [None]:
# predictions and y_test have the same shape, so they can be scored against each other directly.

In [None]:
# Display the predicted 0/1 label matrix (NumPy abbreviates large arrays).
predictions

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Macro F1: the per-label F1 scores averaged with equal weight for every label.
f1_score(y_true=y_test, y_pred=predictions, average="macro")

0.8165097810583446

In [None]:
testing=pd.read_csv("/content/drive/MyDrive/test.csv") # load the test CSV that the final predictions are made on

In [None]:
# Append the document body to 'name', mirroring how the training text was built.
testing = testing.assign(name=testing['name'] + " " + testing['document_text'])

In [None]:
# Apply the same cleanResume preprocessing that was used on the training text.
testing['name'] = testing['name'].apply(cleanResume)

In [None]:
y_test_pred = stack_pipeline.predict(testing['name']) # predict the label matrix for the cleaned test texts with the fitted pipeline

In [None]:
# Display the predicted 0/1 label matrix for the test file.
y_test_pred

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Wrap the test predictions in a DataFrame whose columns carry the label names
# taken from the training targets, then preview the first rows.
ans = pd.DataFrame(data=y_test_pred, columns=y.columns)
ans.head()

Unnamed: 0,Corporate Communications,Securities Settlement,Antitrust,Financial Crime,Commodities Trading,Examinations,Insurance,Required Disclosures,Consumer protection,Market Risk,...,Accounting and Finance,Fraud,Broker Dealer,Securities Issuing,Risk Management,Forms,Definitions,Liquidity Risk,Money Services,Research
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Convert the multilabel output into the required submission format.

In [None]:
# Attach each test document's id to its row of predictions (aligned on the row index).
ans = ans.assign(id=testing['id'])

In [None]:
# Reshape the wide prediction table into the long submission layout: one row per
# (document, label) pair, keyed "<id>_<label>", with a 0/1 prediction.
# Built in one vectorised pass rather than with iterrows(), which creates a
# Series per row and can upcast the id when a row mixes dtypes.
label_columns = list(y.columns)

pred = pd.DataFrame({
    # Document-major order: all labels of the first document, then the second, ...
    'id': [f"{doc_id}_{label}" for doc_id in ans['id'] for label in label_columns],
    # Row-major ravel walks the matrix in the same document-major order as the
    # ids above; anything other than exactly 1 is written as 0.
    'predictions': (ans[label_columns].to_numpy() == 1).astype(int).ravel(),
})

In [None]:
# Display the long-format submission table (pandas truncates the middle rows).
pred

Unnamed: 0,id,predictions
0,4771_Corporate Communications,0
1,4771_Securities Settlement,0
2,4771_Antitrust,0
3,4771_Financial Crime,0
4,4771_Commodities Trading,0
...,...,...
249645,57235_Forms,0
249646,57235_Definitions,0
249647,57235_Liquidity Risk,0
249648,57235_Money Services,0


In [None]:
pred.to_csv("stacking_lbg_rf_with moc best params.csv",index=False) # write the submission CSV to the working directory, without the index column