In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Running this cell (click Run or press Shift+Enter) lists the files found there.

import os
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

['placement_survey_data.csv']


Importing Necessary Libraries

In [2]:
import nltk, re, time
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import random

Importing Data

In [3]:
# Read the placement survey responses into a DataFrame.
# NOTE: the raw frame includes 'Email Address', 'Mobile No.' and 'Roll No.'
# columns, so its preview shows personal identifiers -- clear or redact the
# output before sharing the notebook.
survey_csv = "../input/placement_survey_data.csv"
data = pd.read_csv(survey_csv)
data.head(5)

Unnamed: 0,Timestamp,Email Address,Mobile No.,Roll No.,Future Plans,Feedback regarding all placement related activities conducted till date.,How would you rate the placement department?,How would your rate your overall experience at NIST?,Please give any concerns or queries you might have for the Placement cell.
0,2/9/2019 13:51:23,hrudanginib@gmail.com,7978031518,201611905,Placement,More exams such as cocubes test might help.Eng...,4,4,Not excellent but okay.
1,2/9/2019 14:00:45,varunbeheralego@gmail.com,9870437076,201640285,Higher Studies,Lame.,1,1,Shit.
2,2/9/2019 15:26:41,samalsagar617@gmail.com,9556901600,201610212,Placement,Good but still need improvement,4,4,More focus should give to every individual
3,2/9/2019 15:28:40,ashutosh.satapathy1112@gmail.com,917537093814,201610548,Placement,Should improve,4,4,
4,2/9/2019 15:29:42,ajit0504follow@gmail.com,8249543292,201610087,Placement,No serious step towards secured placement,2,2,


Choosing Required Columns

In [4]:
# Keep only the free-text feedback and the department rating, then give the
# two columns short names that the rest of the notebook uses.
# NOTE: this cell overwrites `data`; re-running it without reloading the CSV
# fails because the long survey column names no longer exist.
FEEDBACK_COLUMN = 'Feedback regarding all placement related activities conducted till date.'
RATING_COLUMN = 'How would you rate the placement department?'

data = data[[FEEDBACK_COLUMN, RATING_COLUMN]]
data.columns = ['review', 'liked']
data.head()

Unnamed: 0,review,liked
0,More exams such as cocubes test might help.Eng...,4
1,Lame.,1
2,Good but still need improvement,4
3,Should improve,4
4,No serious step towards secured placement,2


Binarizing Class

In [5]:
# Collapse the rating into a binary target: ratings above the threshold
# become 1 ("liked"), everything else becomes 0.
# NOTE: the column is overwritten in place, so running this cell a second time
# compares the 0/1 labels against the threshold and turns every label into 0.
LIKED_THRESHOLD = 2
data['liked'] = data['liked'].gt(LIKED_THRESHOLD).astype(int)
data.head()

Unnamed: 0,review,liked
0,More exams such as cocubes test might help.Eng...,1
1,Lame.,0
2,Good but still need improvement,1
3,Should improve,1
4,No serious step towards secured placement,0


Cleaning Data

In [6]:
# Turn every review into a space-joined string of lower-cased, stemmed,
# non-stop-word tokens and collect the results in `corpus` (one entry per row,
# in row order).
#
# The stop-word set and the stemmer do not change between reviews, so they are
# built once here; the original rebuilt set(stopwords.words('english')) for
# every single word and a new PorterStemmer for every row.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
# Iterate over the values directly instead of looking rows up by label with
# data['review'][i], which only works while the index is exactly 0..n-1.
for raw_review in data['review']:
    # A blank answer is read from the CSV as NaN (a float), which would make
    # re.sub raise TypeError; treat it as an empty review instead.
    text = '' if pd.isna(raw_review) else str(raw_review)
    review = re.sub('[^a-zA-Z]', ' ', text)  # keep letters only
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)

Tokenization and Bag-of-Words Vectorization

In [7]:
# Bag-of-words features: one column per token, keeping at most the 252 most
# frequent tokens of the cleaned corpus.
cv = CountVectorizer(max_features=252)
token_counts = cv.fit_transform(corpus)
X = token_counts.toarray()

# The target is the last column of `data`.
y = data.iloc[:, -1].values

# Total number of occurrences of each kept token across all reviews.
token_totals = X.sum(axis=0)
np.asarray(token_totals)

array([  2,  24,   1,   1,   1,   1,   1,   2,   1,   1,   1,   6,   1,
         1,   1,   1,   1,   1,   5,   1,   1,   2,   1,   2,   1,   1,
        13,   1,   1,   1,   2,   1,  10,   1,   1,   8,   4,   8,   7,
         1,   1,   1,   1,   1,   1,   1,   1,   1,  20,   5,   3,   3,
        13,   2,  13,  12,   4,   6,   6,   3,   2,   4,   3,   4,   2,
         3,   3,   3,   5,   3,  12,   3,   8,   2,   2,   2,   4,   2,
         1,   2,   3,   5, 151,   2,   5,   2,  18,   2,   2,   2,   2,
         2,   3,   8,   2,   4,   5,   4,   3,   2,   4,   3,   2,   2,
         2,   2,   2,   2,   2,   8,   4,   2,   2,   6,   4,   4,  15,
         3,   5,  11,   5,   3,   3,   2,  37,   2,   3,   3,   6,   2,
         9,   3,   1,   4,   2,   7,   2,   6,   1,   1,   3,   1,   1,
         1,   1,   1,   2,   1,   1,   1,   1,   1,   2,   1,   1,   1,
         3,   3,   1,   1,   1,   3,   1,   1,   3,   1,   1,   1,   3,
         1,   1,   1,   1,   1,   1,   1,   1,   9,   1,   2,   

Splitting Dataset into Train and Test

In [8]:
# Hold out 30% of the rows for testing. The row positions are split alongside
# X and y so that test predictions can be matched back to the original review
# text later on.
#
# The seed is fixed: the original drew random_state from random.randint(0, 1000),
# so every run produced a different split and the reported accuracy could not
# be reproduced after a kernel restart.
SPLIT_SEED = 42

indices = range(len(data.index))
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    X, y, indices,
    test_size=0.3,
    random_state=SPLIT_SEED,
)

Feature Scaling

In [9]:
# Standardise the count features. The scaler is fitted on the training rows
# only and then applied unchanged to the test rows.
# NOTE: X_train / X_test are overwritten, so re-running this cell without
# re-running the split rescales data that is already scaled.
X_train_float = X_train.astype(float)
sc = StandardScaler().fit(X_train_float)
X_train = sc.transform(X_train_float)
X_test = sc.transform(X_test.astype(float))

Training the Model

In [10]:
# Train a multi-layer perceptron on the scaled bag-of-words features.
#
# The seed is fixed so that weight initialisation, shuffling and the
# early-stopping validation split are the same on every run; the original used
# random.randint(0, 1000), which made each run train a different model.
# 943 is the value that appears in the recorded output of this cell.
MODEL_SEED = 943

clf = MLPClassifier(
    solver='sgd',
    activation='tanh',
    alpha=1e-5,
    hidden_layer_sizes=(64, 64, 64, 32, 16, 4, 4, 4),
    learning_rate_init=0.15,
    learning_rate='adaptive',
    max_iter=10000,
    random_state=MODEL_SEED,
    verbose=False,
    shuffle=True,
    early_stopping=True,  # stops when the held-out validation score stops improving
)
clf.fit(X_train, y_train)

MLPClassifier(activation='tanh', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(64, 64, 64, 32, 16, 4, 4, 4),
       learning_rate='adaptive', learning_rate_init=0.15, max_iter=10000,
       momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
       power_t=0.5, random_state=943, shuffle=True, solver='sgd',
       tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

Testing Accuracy

In [11]:
# Predict labels for the held-out rows and report the fraction predicted
# correctly.
# NOTE(review): accuracy alone can look high when one class dominates the
# labels -- confirm the class balance before relying on this number.
y_pred = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9662921348314607

In [12]:
# Put the original review text next to the true and predicted labels for the
# test rows, preview the table and save it as output.csv.
test_reviews = data['review'][indices_test].values
testpred = pd.DataFrame(
    {'review': test_reviews, 'test': y_test, 'pred': y_pred},
    columns=['review', 'test', 'pred'],
)
print(testpred.head())
testpred.to_csv("output.csv")

                                          review  test  pred
0                                        Average     1     1
1                            Not fully completed     1     1
2                              Nothing conducted     1     1
3                                       helpfull     1     1
4  We get Interactive and good ideas about this.     1     1
