In [1]:
#Import every library and package needed for project

#import pandas lib for table data.
import pandas as pd

#Our bayes model
from sklearn.naive_bayes import MultinomialNB
#Our logistic regression model
from sklearn.linear_model import LogisticRegression
#Our support vector machine model
from sklearn import svm

#Converts text words into numbers for model training.
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

import numpy as np
from sklearn.linear_model import LinearRegression

#import regex lib
import re


annotatedData = pd.read_csv("data.txt", sep='\t')

In [2]:
#Selecting only columns that matter for model training.
corpus = annotatedData[['QueryText' , 'CommentText']]

y = pd.DataFrame({'SimilarityScor' : annotatedData['SimilarityScor']})
#y = annotatedData['SimilarityScor']

query_vectorizer = CountVectorizer(ngram_range=(1,1)) # to use bigrams ngram_range=(2,2)
query_vectors = query_vectorizer.fit_transform(annotatedData['QueryText'])

comment_vectorizer = CountVectorizer(ngram_range=(1,1))
comment_vectors = comment_vectorizer.fit_transform(annotatedData['CommentText'])

comment_columns = comment_vectorizer.get_feature_names()
query_columns = query_vectorizer.get_feature_names()

In [3]:
#returns number of words in string.
def countWords(s):
    return len(re.findall(r'\w+', s))

#find common words in two strings.
def findCommonWords(s1, s2):
      s1 = s1.lower()
      s2 = s2.lower()
      s1List = s1.split(" ")
      s2List = s2.split(" ")
      return len(list(set(s1List)&set(s2List)))
    

#makes new dataframe which is then used to train our models.
def processData(corpus, y) :
    
    allColumns = ['numWordsQuery','numWordsComment','numCommonWords','SimilarityScor']
    
    for name in comment_columns:
        allColumns.append(name)      
         
    d = {}
    
    for column in allColumns:
        d[column] = []
        
    for index,row in corpus.iterrows():
        
        d['numWordsQuery'].append(countWords(row['QueryText']))
        d['numWordsComment'].append(countWords(row['CommentText']))
        d['numCommonWords'].append(findCommonWords(row['QueryText'],row['CommentText']))
        d['SimilarityScor'].append(y.iloc[index]['SimilarityScor'])
        #d['NumericScore'].append(y.iloc[index]['NumericScore'])
    
        vectorComment = comment_vectorizer.transform([row['CommentText']])
        vectorComment = vectorComment.toarray()
    
        #for i, commentCol in enumerate(allColumns[5:572]):
        for i, commentCol in enumerate(allColumns[4:]):
            d[commentCol].append(vectorComment[0][i])
        
    return pd.DataFrame.from_dict(d)
    


def processPredictionData(corpus) :
    
    allColumns = ['numWordsQuery','numWordsComment','numCommonWords']
    
    for name in comment_columns:
        allColumns.append(name)        
         
    d = {}
    
    for column in allColumns:
        d[column] = []
        
    for index,row in corpus.iterrows():
        
        d['numWordsQuery'].append(countWords(row['QueryText']))
        d['numWordsComment'].append(countWords(row['CommentText']))
        d['numCommonWords'].append(findCommonWords(row['QueryText'],row['CommentText']))

        vectorComment = comment_vectorizer.transform([row['CommentText']])
        vectorComment = vectorComment.toarray()
        
        vectorQuery = query_vectorizer.transform([row['QueryText']])
        vectorQuery = vectorQuery.toarray()
    
        for i, commentCol in enumerate(allColumns[3:570]):
            d[commentCol].append(vectorComment[0][i])
     
     
    return pd.DataFrame.from_dict(d)
                                 
                                 
                                 
trainingData = processData(corpus,y)    

y = trainingData['SimilarityScor']
y = y.astype('int')

#z = trainingData['NumericScore']
#z = z.astype('int')

#X = trainingData.drop(['NumericScore','SimilarityScor'], 1)
X = trainingData.drop('SimilarityScor', 1)


x_train, x_test, y_train, y_test = train_test_split(X, y ,test_size=0.2, random_state=42)

In [41]:
#LOGISTICAL REGRESSION

#10 layer cross validation to train and validate logistical regression model with tuning hyper params.
logReg = LogisticRegression(penalty='l2',solver = 'liblinear', max_iter=150000)
p_grid_lr = {'classifier__C' : [0.1, 0.5, 1.2]}
p_logReg = Pipeline([('classifier', logReg)])
gs_logReg = GridSearchCV(estimator=p_logReg, param_grid=p_grid_lr, cv = 10, scoring='accuracy')

gs_logReg.fit(x_train,y_train)

acc = cross_val_score(gs_logReg, x_train, y_train, cv = 10)

print(acc)
print("Accuracy: ", acc.mean())

bestLRModel = gs_logReg.best_estimator_

[0.57407407 0.66666667 0.53703704 0.51851852 0.62962963 0.66666667
 0.62962963 0.55555556 0.59259259 0.62264151]
Accuracy:  0.5993011879804333


In [42]:
# 10 layer cross validation to train and validate SVM model.
svmModel = svm.SVC(kernel='linear')
#svmModel = svm.LinearSVC(penalty='l2', loss='squared_hinge', max_iter = 10000)

p_grid_svm = {'classifier__C' : [0.3, 0.5, 0.9]}
p_svm = Pipeline([('classifier', svmModel)])
gs_SVM = GridSearchCV(estimator=p_svm, param_grid=p_grid_svm, cv = 10, scoring='accuracy')

acc = cross_val_score(gs_SVM, x_train, y_train, cv = 10)

print(acc)
print("Accuracy: ", acc.mean())

gs_SVM.fit(X,y)

bestSVMModel = gs_SVM.best_estimator_

[0.59259259 0.7037037  0.57407407 0.55555556 0.66666667 0.66666667
 0.64814815 0.55555556 0.53703704 0.64150943]
Accuracy:  0.6141509433962266


In [43]:
#NAIVE BAYES 

#bayes model with 10 layer cross validation

skf = StratifiedKFold(n_splits=10)
params = {}
nb = MultinomialNB()
gs = GridSearchCV(nb, cv=skf, param_grid=params, return_train_score=True)

p_bayes = Pipeline([('classifier', nb)])
acc = cross_val_score(p_bayes, X, y, cv = 10);

print(acc)
print("Accuracy: ", acc.mean())

gs.fit(X,y)

bestSVMModel = gs.best_estimator_

[0.44117647 0.52941176 0.30882353 0.42647059 0.43283582 0.41791045
 0.47761194 0.40298507 0.43283582 0.47761194]
Accuracy:  0.4347673397717296


In [33]:
#Predictions

testSet = pd.read_csv("predict.txt", sep='\t')

testSet = testSet[['QueryText' , 'CommentText']]

testData =  processPredictionData(testSet)

predictedData = bestLRModel.predict(testData)
predictedData

ValueError: arrays must all be same length

In [52]:
#linear regression

linReg = LinearRegression()

#params = linReg.get_params().keys()
#params

#p_grid_lr = {'alpha' : [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
#p_linReg = Pipeline([('classifier', linReg)])
#gs_linReg = GridSearchCV(estimator=p_linReg, param_grid=p_grid_lr, cv = 10, scoring='accuracy')

#gs_linReg.fit(X,y)

#acc = cross_val_score(gs_linReg, X, y, cv = 10)

#print(acc)
#print("Accuracy: ", acc.mean())

#bestLRModel = gs_logReg.best_estimator_

linReg.fit(X, y)

testSet = pd.read_csv("predict.txt", sep='\t')

testSet = testSet[['QueryText' , 'CommentText']]

testData =  processPredictData(testSet)

#testData.head()


#t = testSet[]

#X.head()

#

#testData

#predictedData = linReg.predict(testData)
#predictedData

array([2.10700631, 1.27678253, 2.29788204])