In [98]:
#Import every library and package needed for project

#import pandas lib for table data.
import pandas as pd

#Our bayes model
from sklearn.naive_bayes import MultinomialNB
#Our logistic regression model
from sklearn.linear_model import LogisticRegression
#Our support vector machine model
from sklearn import svm

#Converts text words into numbers for model training.
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

import numpy as np
from sklearn.linear_model import LinearRegression

#import regex lib
import re


#Loading stemmed annotated data from phase 2.
annotatedData = pd.read_csv("testiranje.txt", sep='\t')
#annotatedData.head()

In [99]:
#Selecting only columns that matter for model training.
corpus = annotatedData[['QueryText' , 'CommentText']]

#y = pd.DataFrame({'SimilarityScor' : annotatedData['SimilarityScor']})
y = annotatedData[['SimilarityScor','NumericScore']]

query_vectorizer = CountVectorizer(ngram_range=(1,1)) # to use bigrams ngram_range=(2,2)
query_vectors = query_vectorizer.fit_transform(annotatedData['QueryText'])

comment_vectorizer = CountVectorizer(ngram_range=(1,1))
comment_vectors = comment_vectorizer.fit_transform(annotatedData['CommentText'])

comment_columns = comment_vectorizer.get_feature_names()

In [100]:
#returns number of words in string.
def countWords(s):
    return len(re.findall(r'\w+', s))

#find common words in two strings.
def findCommonWords(s1, s2):
      s1 = s1.lower()
      s2 = s2.lower()
      s1List = s1.split(" ")
      s2List = s2.split(" ")
      return len(list(set(s1List)&set(s2List)))

#makes new dataframe which is then used to train our models.
def processTestData(corpus, y) :
    
    allColumns = ['numWordsQuery','numWordsComment','numCommonWords','SimilarityScor','NumericScore']
    
    for name in comment_columns:
        allColumns.append(name)    
    
    temp = pd.DataFrame(columns = allColumns)
    numColumns = len(temp.columns)
        
    for i in range(0, len(corpus.index)):
        
        temp.loc[i, 'numWordsQuery'] = countWords(corpus.values[i][0])
        temp.loc[i, 'numWordsComment'] = countWords(corpus.values[i][1])
        temp.loc[i, 'numCommonWords'] = findCommonWords(corpus.values[i][0],corpus.values[i][1])
        temp.loc[i, 'SimilarityScor'] = y.values[i][0]
        temp.loc[i, 'NumericScore'] = y.values[i][1]
        
        commentText = [corpus.values[i][1]];
 
        vectorComment = comment_vectorizer.transform(commentText)
        vectorComment = vectorComment.toarray()
        
        for j in range(5, numColumns):
            #for k in range(0, 567):
              #  temp.loc[i, allColumns[j]] = vectorComment[0][k]
                temp.loc[i, allColumns[j]]  = 3
            
    return temp
    
#trainingData = processTestData(corpus,y)    

#returns new dataframe for test set
def processPredictData(corpus) :
    
    temp = pd.DataFrame(columns = ['numWordsQuery','numWordsComment','numCommonWords'])
        
    for i in range(0, len(corpus.index)):
        
        temp.loc[i, 'numWordsQuery'] = countWords(corpus.values[i][0])
        temp.loc[i, 'numWordsComment'] = countWords(corpus.values[i][1])
        temp.loc[i, 'numCommonWords'] = findCommonWords(corpus.values[i][0],corpus.values[i][1])
        

        
    return temp
    
trainingData = processTestData(corpus,y)    

X = trainingData[['numWordsQuery','numWordsComment','numCommonWords']]

y = trainingData['SimilarityScor']
y = y.astype('int')

trainingData.head()

#yRegression = trainingData['NumericScore']
#yRegression = yRegression.asType('int')

Unnamed: 0,numWordsQuery,numWordsComment,numCommonWords,SimilarityScor,NumericScore,200,204,30,3d,401,...,zamen,zamenjen,zaposlen,zastit,zastiti,zat,zatvor,zbir,zbog,zon
0,1,5,1,3,350,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
1,1,14,0,2,240,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
2,1,7,1,3,370,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
3,1,12,1,2,239,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
4,1,7,0,2,220,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3


In [4]:
#LOGISTICAL REGRESSION

#10 layer cross validation to train and validate logistical regression model with tuning hyper params.
logReg = LogisticRegression(penalty='l2',solver = 'liblinear', max_iter=100000)
p_grid_lr = {'classifier__C' : [0.1, 0.5, 1.2]}
p_logReg = Pipeline([('classifier', logReg)])
gs_logReg = GridSearchCV(estimator=p_logReg, param_grid=p_grid_lr, cv = 10, scoring='accuracy')

gs_logReg.fit(X,y)

acc = cross_val_score(gs_logReg, X, y, cv = 10)

print(acc)
print("Accuracy: ", acc.mean())

bestLRModel = gs_logReg.best_estimator_

[0.57142857 0.4        0.6        0.51428571 0.51428571 0.62857143
 0.54285714 0.45714286 0.54285714 0.58823529]
Accuracy:  0.5359663865546218


In [5]:
# 10 layer cross validation to train and validate SVM model.
svmModel = svm.SVC(kernel='linear')
#svmModel = svm.LinearSVC(penalty='l2', loss='squared_hinge', max_iter = 10000)

p_grid_svm = {'classifier__C' : [0.3, 0.5, 0.9]}
p_svm = Pipeline([('classifier', svmModel)])
gs_SVM = GridSearchCV(estimator=p_svm, param_grid=p_grid_svm, cv = 10, scoring='accuracy')

acc = cross_val_score(gs_SVM, X, y, cv = 10)

print(acc)
print("Accuracy: ", acc.mean())

gs_SVM.fit(X,y)

bestSVMModel = gs_SVM.best_estimator_

[0.45714286 0.34285714 0.51428571 0.57142857 0.51428571 0.6
 0.57142857 0.57142857 0.54285714 0.55882353]
Accuracy:  0.5244537815126049


In [26]:
#NAIVE BAYES 

#bayes model with 10 layer cross validation
bayes = MultinomialNB()

p_bayes = Pipeline([('classifier', bayes)])
acc = cross_val_score(p_bayes, X, y, cv = 10);

print(acc)
print("Accuracy: ", acc.mean())

p_bayes.fit(X,y)

[0.54285714 0.4        0.54285714 0.45714286 0.45714286 0.31428571
 0.48571429 0.48571429 0.45714286 0.47058824]
Accuracy:  0.4613445378151261


Pipeline(steps=[('classifier', MultinomialNB())])

In [45]:
#Predictions

testSet = pd.read_csv("predict.txt", sep='\t')

testSet = testSet[['QueryText' , 'CommentText']]

testData =  processPredictData(testSet)

#testData.head()


#t = testSet[]

#X.head()

#

#testData

predictedData = bestLRModel.predict(testData)
predictedData

#print(predictedData[0], predictedData[1], predictedData[2])

array([3, 0, 3])

In [52]:
#linear regression

linReg = LinearRegression()

#params = linReg.get_params().keys()
#params

#p_grid_lr = {'alpha' : [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
#p_linReg = Pipeline([('classifier', linReg)])
#gs_linReg = GridSearchCV(estimator=p_linReg, param_grid=p_grid_lr, cv = 10, scoring='accuracy')

#gs_linReg.fit(X,y)

#acc = cross_val_score(gs_linReg, X, y, cv = 10)

#print(acc)
#print("Accuracy: ", acc.mean())

#bestLRModel = gs_logReg.best_estimator_

linReg.fit(X, y)

testSet = pd.read_csv("predict.txt", sep='\t')

testSet = testSet[['QueryText' , 'CommentText']]

testData =  processPredictData(testSet)

#testData.head()


#t = testSet[]

#X.head()

#

#testData

predictedData = linReg.predict(testData)
predictedData

array([2.10700631, 1.27678253, 2.29788204])