In [1]:
#importing libraries
%matplotlib inline
%config InlineBackend.figure_format='retina'
import numpy as np
import pandas as pd
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [2]:
# Load the training set with pandas, supplying the column names explicitly.
column_names = ['unique_id', 'comment', 'agr_lvl']
df = pd.read_csv('agr_en_train.csv', names=column_names)

In [3]:
# Show the first row of the frame to see what a single example looks like.
first_example = df.iloc[0]
print(first_example)

unique_id                          facebook_corpus_msr_1723796
comment      Well said sonu..you have courage to stand agai...
agr_lvl                                                    OAG
Name: 0, dtype: object


In [4]:
# Pull the comment texts and their aggression-level labels out of the frame
# as two separate arrays.
reviews, labels_str = (df[column].values for column in ('comment', 'agr_lvl'))

In [None]:
# Print the review texts together with how many of them there are.
review_count = len(reviews)
print(reviews, review_count)

In [10]:
# Convert the string labels into integer class ids (stored in `labels`) for computation.
label2value_dic={'OAG':2,'CAG':1,'NAG':0}

labels=np.zeros(len(labels_str),dtype=int)
# Track which entries matched a known label, so that an unmapped label cannot
# silently keep the zero fill value (which would count it as 'NAG').
is_mapped=np.zeros(len(labels_str),dtype=bool)
for label_name,label_value in label2value_dic.items():
    ind_this=labels_str==label_name
    labels[ind_this]=label_value
    is_mapped|=ind_this

if not is_mapped.all():
    unknown_labels=sorted({str(name) for name in labels_str[~is_mapped]})
    raise ValueError('labels missing from label2value_dic: %s'%unknown_labels)

print(np.unique(labels))
print(labels_str)

[0 1 2]
['OAG' 'NAG' 'OAG' ... 'OAG' 'OAG' 'NAG']


In [11]:
# Computes the fold size and splits a shuffled index array into k folds.
# Create one instance per experiment: every construction reshuffles the indices
# (pass `seed` to make the shuffle repeatable).
class kfoldValidation:
    """Shuffled k-fold index splitter.

    The indices 0..datasize-1 are shuffled once at construction and cut into k
    consecutive folds of `foldsize` elements each; the last fold additionally
    takes the remainder when datasize is not divisible by k.
    """

    def __init__(self, datasize, k, seed=None):
        """Shuffle the indices and compute the fold size.

        Parameters
        ----------
        datasize : int
            Number of samples to split.
        k : int
            Number of folds; must satisfy 1 <= k <= datasize.
        seed : int or None, optional
            When given, the shuffle uses its own seeded generator and is
            repeatable. When None (default), NumPy's global random state is
            used.

        Raises
        ------
        ValueError
            If k is smaller than 1 or larger than datasize.
        """
        if k < 1 or k > datasize:
            raise ValueError(
                'k must be between 1 and datasize, got k=%r, datasize=%r' % (k, datasize)
            )
        self.k = k
        # Plain integer floor division; the np.int alias no longer exists in
        # current NumPy releases.
        self.foldsize = int(datasize // k)
        self.index_arr = np.arange(datasize)
        shuffler = np.random if seed is None else np.random.RandomState(seed)
        shuffler.shuffle(self.index_arr)

    def getfold(self, x):
        """Return the index arrays of fold x (0-based).

        Note the order of the returned pair: test indices first, then training
        indices.

        Raises
        ------
        IndexError
            If x is not in range(k).
        """
        if not 0 <= x < self.k:
            raise IndexError('fold number %r is outside 0..%d' % (x, self.k - 1))

        start = x * self.foldsize
        if x == self.k - 1:
            # The last fold runs to the end so the remainder is not dropped.
            index_test = self.index_arr[start:]
            index_train = self.index_arr[:start]
        else:
            stop = start + self.foldsize
            index_test = self.index_arr[start:stop]
            index_train = np.append(self.index_arr[:start], self.index_arr[stop:])

        return index_test, index_train



In [13]:
# This function performs the training and validation on the data, one SVM per fold.

def perform_kfold_validation(kernel='linear',k=10):
    """Run k-fold cross-validation of a TF-IDF + SVM classifier.

    Reads the notebook-level arrays `reviews` (texts) and `labels` (integer
    class ids). For every fold a fresh TF-IDF vectorizer is fitted on the
    training texts only, an `svm.SVC` with the requested kernel is trained,
    and the held-out fold is scored.

    Parameters
    ----------
    kernel : str
        Kernel name passed to `svm.SVC`.
    k : int
        Number of folds.

    Returns
    -------
    precision, recall, f1 : numpy arrays of shape (k, number of classes)
        Per-class scores, one row per fold; columns follow the ascending
        class ids found in `labels`.
    accuracy : numpy array of length k
        Overall accuracy of each fold.
    """
    # Build the splitter only once: every construction reshuffles the indices.
    kfold = kfoldValidation(len(reviews),k)

    # Score every class in every fold, even one that is absent from a fold's
    # test labels and predictions, so all per-fold rows have the same length.
    class_ids = np.unique(labels)

    precision_rows, recall_rows, f1_rows, accuracy_values = [], [], [], []

    for ii in range(k):

        print('working on fold (with kernel= %s):%d/%d'%(kernel,ii,k))

        # Test and training sample indices of this fold.
        test_index,train_index=kfold.getfold(ii)

        # Feature extraction/vectorisation: TF-IDF reduces the impact of
        # tokens that are used very frequently.
        vectorizer = TfidfVectorizer(min_df=5,sublinear_tf=True)

        # Fit the vocabulary on the training texts only, then reuse it for the
        # held-out texts so nothing from the test fold leaks into the features.
        train_vectors = vectorizer.fit_transform(reviews[train_index])
        test_vectors = vectorizer.transform(reviews[test_index])

        # SVM with the kernel chosen by the caller.
        classifier = svm.SVC(kernel=kernel)
        classifier.fit(train_vectors, labels[train_index]) #train classifier on train data
        prediction = classifier.predict(test_vectors) #predict test data

        # Evaluate the model on the held-out fold.
        report = metrics.precision_recall_fscore_support(
            labels[test_index], prediction, labels=class_ids)
        report_accuracy = accuracy_score(labels[test_index], prediction)

        precision_rows.append(report[0])
        recall_rows.append(report[1])
        f1_rows.append(report[2])
        accuracy_values.append(report_accuracy)

    # Stack once at the end: one row per fold.
    precision = np.vstack(precision_rows)
    recall = np.vstack(recall_rows)
    f1 = np.vstack(f1_rows)
    accuracy = np.array(accuracy_values)

    return precision, recall,f1,accuracy

# Run 10-fold cross-validation once per SVM kernel and print the averaged scores.
k = 10
kernels_to_try = ('linear', 'poly', 'sigmoid', 'rbf')
for kernel in kernels_to_try:
    precision, recall, f1, accuracy = perform_kfold_validation(kernel=kernel, k=k)

    # Overall means first, then the per-class means taken across the folds.
    print('kernel used: ', kernel)
    print('mean overall accuracy =', accuracy.mean())
    print('mean precision= ', precision.mean(), 'mean precision per class', precision.mean(axis=0))
    print('mean recall   = ', recall.mean(), 'mean recall per class', recall.mean(axis=0))
    print('mean f1       = ', f1.mean(), 'mean f1 per class', f1.mean(axis=0))
print('Finished')

working on fold (with kernel= linear):0/10
working on fold (with kernel= linear):1/10
working on fold (with kernel= linear):2/10
working on fold (with kernel= linear):3/10
working on fold (with kernel= linear):4/10
working on fold (with kernel= linear):5/10
working on fold (with kernel= linear):6/10
working on fold (with kernel= linear):7/10
working on fold (with kernel= linear):8/10
working on fold (with kernel= linear):9/10
kernel used:  linear
mean overall accuracy = 0.5617100895337727
mean precision=  0.5562720770943074 mean precision per class [0.63482497 0.4806589  0.55333236]
mean recall   =  0.5282042904024576 mean recall per class [0.69268917 0.53987579 0.3520479 ]
mean f1       =  0.53334816300879 mean f1 per class [0.66218878 0.50806753 0.42978818]
working on fold (with kernel= poly):0/10


  'precision', 'predicted', average, warn_for)


working on fold (with kernel= poly):1/10
working on fold (with kernel= poly):2/10
working on fold (with kernel= poly):3/10
working on fold (with kernel= poly):4/10
working on fold (with kernel= poly):5/10
working on fold (with kernel= poly):6/10
working on fold (with kernel= poly):7/10
working on fold (with kernel= poly):8/10
working on fold (with kernel= poly):9/10
kernel used:  poly
mean overall accuracy = 0.4209501985650294
mean precision=  0.14031673285500978 mean precision per class [0.4209502 0.        0.       ]
mean recall   =  0.3333333333333333 mean recall per class [1. 0. 0.]
mean f1       =  0.19740499010596624 mean f1 per class [0.59221497 0.         0.        ]
working on fold (with kernel= sigmoid):0/10
working on fold (with kernel= sigmoid):1/10
working on fold (with kernel= sigmoid):2/10
working on fold (with kernel= sigmoid):3/10
working on fold (with kernel= sigmoid):4/10
working on fold (with kernel= sigmoid):5/10
working on fold (with kernel= sigmoid):6/10
working 