<b>
  Stephen Hullender
</b>
<br/>
<span>
  CIS 4526 - Foundations in Machine Learning
  (Fall 2022)
</span>
<br/>
<span>
  Midterm Assignment: Paraphrase Identification
</span>

<br/>
<h3>Getting Started</h3>

In [1]:
# libraries
import pandas as pd
import numpy as np
import sys
np.set_printoptions(threshold=100)
import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 50)
#pd.set_option('display.max_rows', sys.maxsize)

# SKlearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, multilabel_confusion_matrix, mean_absolute_error
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# scipy
from scipy import linalg
from scipy.stats import wasserstein_distance
import scipy.spatial as sp
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# NLTK
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
from nltk import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

# other 
from google.colab import drive, files
import pickle
import time
import re
import random
import os
import time
import calendar 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [2]:
# define source folder and text file names w/ labels needed for each table
# SRC is a relative path; the notebook mounts Google Drive at /content/gdrive,
# so it presumably only resolves when the working directory is /content.
SRC = "gdrive/My Drive/MLMidterm"
SRC_TRAIN = f"{SRC}/train_with_label.txt"
SRC_DEV = f"{SRC}/dev_with_label.txt"
SRC_TEST = f"{SRC}/test_without_label.txt"
# column names handed to read_csv; the test file is read without the last one
LABELS = ["instance_id", "sentence_1", "sentence_2", "gold_label"]

# conversion of txt to DataFrame
def load_txt_to_pd():
  """Read the train/dev/test text files into module-level DataFrames.

  Rebinds the globals ``train``, ``dev`` and ``test``. Every file is parsed
  as tab-separated UTF-8 with malformed lines skipped; the test file is read
  without the trailing ``gold_label`` column.

  NOTE(review): the label "band-aids" applied later in this notebook may be
  needed because read_csv treats double quotes inside sentences as field
  quoting -- confirm, and consider ``quoting=csv.QUOTE_NONE``. Doing so would
  change the row numbering, so the hard-coded band-aid indexes would have to
  be removed at the same time.
  """
  global train, dev, test
  # options common to all three reads
  # encoding: https://dev.to/_aadidev/3-ways-to-handle-non-utf-8-characters-in-pandas-242
  shared_options = dict(delimiter="\t", on_bad_lines='skip', encoding='utf-8')
  unlabeled_columns = LABELS[:-1]
  train = pd.read_csv(SRC_TRAIN, names=LABELS, **shared_options)
  dev = pd.read_csv(SRC_DEV, names=LABELS, **shared_options)
  test = pd.read_csv(SRC_TEST, names=unlabeled_columns, **shared_options)
  print("Text files have been converted to Pandas DataFrames...")

# connect to Google Drive to fetch files
# First try to read directly (Drive already mounted). Only a missing file is
# treated as "Drive not mounted yet"; any other failure (parse error, typo in
# the code, KeyboardInterrupt) is allowed to surface instead of being hidden
# behind a mount prompt, which is what the previous bare `except:` did.
try:
  load_txt_to_pd()
except FileNotFoundError:
  drive.mount('/content/gdrive')
  load_txt_to_pd()

Mounted at /content/gdrive
Text files have been converted to Pandas DataFrames...


<h3>Data Cleaning and Preprocessing</h3>

In [3]:
# take out nan values from all tables
train = train.dropna().reset_index(drop=True)
dev = dev.dropna().reset_index(drop=True)
test = test.dropna().reset_index(drop=True)

# band-aids
# Row 69 of the test table is removed by position (hard-coded index).
# reset_index(drop=True) matches the three lines above: the old call without
# drop=True inserted a stray 'index' column and raised "cannot insert index,
# already exists" when the cell was executed a second time.
# NOTE: re-running this cell still removes whatever row is at position 69 at
# that moment, so run it only once per fresh load.
test = test.drop(index=[69]).reset_index(drop=True)

In [4]:
# along with LABELS, make a TABLES array
TABLES = [train, dev, test]

# plain-list copies of the label columns; the patches below edit these lists
# only, the DataFrames keep their original gold_label values
gold_train = train['gold_label'].tolist()
gold_dev = dev['gold_label'].tolist()

# labels missing (showing text instead) on the following indexes:
training_label_band_aids = [
    319, 818, 904, 1007, 1076, 1089, 1132, 1258, 1408, 1476, 1546, 1664, 1735, 1789, 1857,
    2012, 2088, 2356, 2784, 2999, 3087, 3299, 3368, 3451, 3630, 3813
]
# NOTE(review): the commented-out list holds per-row labels, but the active
# line below assigns 0 to every patched row -- confirm which one is intended.
#corresponding_train_labels = [
#    1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1
#]
corresponding_train_labels = [0] * 26
# labels are stored as strings, so the replacement is stringified as well
for patched_row, patched_label in zip(training_label_band_aids, corresponding_train_labels):
  gold_train[patched_row] = str(patched_label)

# another band aid
dev_label_band_aids = [77, 415, 602]
#corresponding_dev_labels = [1, 0, 1]
corresponding_dev_labels = [0] * 3
for patched_row, patched_label in zip(dev_label_band_aids, corresponding_dev_labels):
  gold_dev[patched_row] = str(patched_label)

In [5]:
# Lookup table: contraction -> expanded form.
# clean_text() lower-cases the sentence BEFORE looking words up here, so a key
# containing a capital letter can never match. The first-person entries were
# only present as "I'd", "I'm", ... and were therefore silently ignored; each
# now has an all-lowercase twin (with a lowercase expansion so that the
# stop-word filter, which is lowercase, still recognises "i"). The capitalised
# originals are kept for any caller that looks up un-lowered text.
contractions = { 
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "I'd": "I would",
  "I'd've": "I would have",
  "I'll": "I will",
  "I'll've": "I will have",
  "I'm": "I am",
  "I've": "I have",
  # lowercase twins of the six entries above (see header comment)
  "i'd": "i would",
  "i'd've": "i would have",
  "i'll": "i will",
  "i'll've": "i will have",
  "i'm": "i am",
  "i've": "i have",
  "isn't": "is not",
  "it'd": "it would",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is",
  "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have",
  "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "that'd": "that had",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there would",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "to've": "to have",
  "wasn't": "was not",
  "we'd": "we would",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have",
  "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you would",
  "you'd've": "you would have",
  "you'll": "you will",
  "you'll've": "you will have",
  "you're": "you are",
  "you've": "you have"
}

In [6]:
def regex(s):
  """Strip URLs, HTML remnants and punctuation from a string.

  Parameters
  ----------
  s : str
      Text to clean.

  Returns
  -------
  str
      The text with URLs and ``&amp;`` removed, and ``<a href``, ``<br />``,
      the listed punctuation characters and apostrophes each replaced by a
      single space. Runs of spaces are not collapsed.
  """
  # Remove the URL itself only. The previous pattern ended in a greedy `.*`,
  # which also deleted every word that followed the URL on the same line.
  s = re.sub(r'https?://\S+', '', s)
  s = re.sub(r'<a href', ' ', s)
  s = re.sub(r'&amp;', '', s)
  # Must run BEFORE the punctuation pass: that pass replaces '/', after which
  # the literal '<br />' could never match (it used to come last).
  s = re.sub(r'<br />', ' ', s)
  s = re.sub(r'[_"\-;%()|+&=*.,!?:#$@\[\]/]', ' ', s)
  s = re.sub(r'\'', ' ', s)
  return s

# Filled on first use by stops(); loading the NLTK list once instead of on
# every call avoids re-reading the corpus for each sentence.
_ENGLISH_STOPWORDS = None

def stops(s):
  """Remove English stop words from a string.

  Parameters
  ----------
  s : str
      Text to filter; it is split with NLTK's ``word_tokenize``.

  Returns
  -------
  str
      The remaining tokens joined by single spaces. Matching is exact, so the
      caller is expected to pass lower-cased text.
  """
  global _ENGLISH_STOPWORDS
  if _ENGLISH_STOPWORDS is None:
    _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
  return " ".join(
    token for token in word_tokenize(s) if token not in _ENGLISH_STOPWORDS
  )

def clean_text(sentence):
  """Turn one raw sentence into a list of cleaned tokens.

  Steps, in order: lower-case, split on whitespace, expand any word found in
  ``contractions``, run ``regex`` (URL/markup/punctuation removal), run
  ``stops`` (stop-word removal) and finally tokenise with
  ``WordPunctTokenizer``.

  Parameters
  ----------
  sentence : str
      Raw sentence text.

  Returns
  -------
  list of str
      Tokens produced by ``WordPunctTokenizer().tokenize``.
  """
  lowered_words = sentence.lower().split()
  # contraction lookup is per whitespace-delimited word, after lower-casing
  expanded = " ".join(contractions.get(word, word) for word in lowered_words)
  filtered = stops(regex(expanded))
  return WordPunctTokenizer().tokenize(filtered)

In [7]:
# Run clean_text over both sentence columns of every table. Each result is a
# list of tokens (punctuation and stop words removed) stored in a new column
# named 'new_<original column>'.
for table in TABLES:
  for source_column in ('sentence_1', 'sentence_2'):
    table['new_' + source_column] = [
      clean_text(raw_sentence) for raw_sentence in table[source_column]
    ]

In [8]:
# lemmatize text, last step in cleaning words
lemmatizer = WordNetLemmatizer()

# every token of every token list is replaced by its WordNet lemma, in place
# of the previous content of the two 'new_sentence_*' columns
for table in TABLES:
  for token_column in ('new_sentence_1', 'new_sentence_2'):
    table[token_column] = [
      [lemmatizer.lemmatize(token) for token in tokens]
      for tokens in table[token_column]
    ]

In [9]:
# drop original text columns & reorder columns
new_cols = ['new_sentence_1', 'new_sentence_2', 'gold_label']

# the raw sentences are removed in place from the loaded frames ...
for frame in (train, dev, test):
  frame.drop(columns=['sentence_1', 'sentence_2'], inplace=True)

# ... and each name is then rebound to a frame holding only the wanted
# columns (the test table has no gold_label)
train = train.reindex(columns=new_cols)
dev = dev.reindex(columns=new_cols)
test = test.reindex(columns=new_cols[:2])

In [None]:
# preview: first rows of the training table (two token-list columns + gold_label)
train.head()

In [None]:
# preview: first rows of the dev table (two token-list columns + gold_label)
dev.head()

In [None]:
# preview: first rows of the test table (two token-list columns, no label)
test.head()

In [13]:
# short aliases for the token-list columns:
#   n*  -> train, nd* -> dev, nx* -> test; the digit is the sentence number
n1 = train['new_sentence_1']
n2 = train['new_sentence_2']

nd1 = dev['new_sentence_1']
nd2 = dev['new_sentence_2']

nx1 = test['new_sentence_1']
nx2 = test['new_sentence_2']

In [14]:
# fetch length (number of tokens) of each row
len_n1 = [len(tokens) for tokens in n1]
len_n2 = [len(tokens) for tokens in n2]
len_nd1 = [len(tokens) for tokens in nd1]
len_nd2 = [len(tokens) for tokens in nd2]
len_nx1 = [len(tokens) for tokens in nx1]
len_nx2 = [len(tokens) for tokens in nx2]

# calculate differences and average for each pair of columns
lenavg_train = [(a + b) / 2 for a, b in zip(len_n1, len_n2)]
lenavg_dev = [(a + b) / 2 for a, b in zip(len_nd1, len_nd2)]
lenavg_test = [(a + b) / 2 for a, b in zip(len_nx1, len_nx2)]
# The difference is the plain absolute difference for ALL three splits. The
# train version used to be divided by 2 while dev/test were not, so the model
# was fitted on a feature half the scale of the one it was evaluated on.
# NOTE: comparison tables cached in the compare_*.pickle files were built with
# the old values; delete those files to pick up the corrected feature.
lendiff_train = [abs(a - b) for a, b in zip(len_n1, len_n2)]
lendiff_dev = [abs(a - b) for a, b in zip(len_nd1, len_nd2)]
lendiff_test = [abs(a - b) for a, b in zip(len_nx1, len_nx2)]

In [15]:
# separate TfidfVectorizer's for each array/Series of text data
# The documents are already token lists, so the "preprocessor" just joins
# them back into a string and lower-casing is switched off.
TFIDF_OPTIONS = dict(
  use_idf=True,
  smooth_idf=True,
  preprocessor=" ".join,
  stop_words='english',
  lowercase=False,
)

tfidf_n1 = TfidfVectorizer(**TFIDF_OPTIONS)
tfidf_n2 = TfidfVectorizer(**TFIDF_OPTIONS)
tfidf_nd1 = TfidfVectorizer(**TFIDF_OPTIONS)
tfidf_nd2 = TfidfVectorizer(**TFIDF_OPTIONS)
tfidf_nx1 = TfidfVectorizer(**TFIDF_OPTIONS)
tfidf_nx2 = TfidfVectorizer(**TFIDF_OPTIONS)

In [None]:
# matrices of numbers from TfidfVectorizer
#
# Both vectorizers of a pair are fitted on the SAME documents (sentence 1
# followed by sentence 2 of that split) and each column is then transformed.
# Previously every column was fitted on its own, which gave the two matrices
# of a pair different vocabularies: column j of arr_n1 and column j of arr_n2
# stood for different words, so any row-wise comparison of the two (the cosine
# computed later) compared unrelated dimensions. With a shared fit the columns
# line up word-for-word and share the same IDF weights.
# The vocabulary set differences and zero-padding done in later cells become
# no-ops (the differences are empty) but remain valid.
train_docs = list(n1) + list(n2)
dev_docs = list(nd1) + list(nd2)
test_docs = list(nx1) + list(nx2)

arr_n1 = tfidf_n1.fit(train_docs).transform(n1).toarray()
arr_n2 = tfidf_n2.fit(train_docs).transform(n2).toarray()
arr_nd1 = tfidf_nd1.fit(dev_docs).transform(nd1).toarray()
arr_nd2 = tfidf_nd2.fit(dev_docs).transform(nd2).toarray()
arr_nx1 = tfidf_nx1.fit(test_docs).transform(nx1).toarray()
arr_nx2 = tfidf_nx2.fit(test_docs).transform(nx2).toarray()

# number of rows per split
print(len(arr_n1)); print(len(arr_nd1)); print(len(arr_nx1))

In [17]:
# add x number of arrays of same length to accommodate for missing words
# get words that are available in one set but not another
def _only_in_first(first_vectorizer, second_vectorizer):
  """Return the feature names of the first vectorizer absent from the second.

  The result is a list built from a set, so its order is not defined.
  """
  first_names = set(first_vectorizer.get_feature_names_out())
  return list(first_names.difference(second_vectorizer.get_feature_names_out()))

train_1_not_2 = _only_in_first(tfidf_n1, tfidf_n2)
train_2_not_1 = _only_in_first(tfidf_n2, tfidf_n1)
dev_1_not_2 = _only_in_first(tfidf_nd1, tfidf_nd2)
dev_2_not_1 = _only_in_first(tfidf_nd2, tfidf_nd1)
test_1_not_2 = _only_in_first(tfidf_nx1, tfidf_nx2)
test_2_not_1 = _only_in_first(tfidf_nx2, tfidf_nx1)

In [18]:
# take each existing TF-IDF matrix and append all-zero columns so that both
# matrices of a pair end up with the same number of columns
def _pad_zero_columns(matrix, n_extra):
    """Return `matrix` with `n_extra` all-zero columns appended on the right.

    Replaces six copies of a row-by-row Python-list loop with one
    `np.hstack`, which produces the same values without converting every
    row to a list and back.

    Parameters
    ----------
    matrix : 2-D array-like
        Rows are documents, columns are features.
    n_extra : int
        Number of zero columns to add (0 leaves the values unchanged).

    Returns
    -------
    numpy.ndarray
        New array of shape (rows, columns + n_extra), same dtype as the input.
    """
    matrix = np.asarray(matrix)
    padding = np.zeros((matrix.shape[0], n_extra), dtype=matrix.dtype)
    return np.hstack([matrix, padding])

# each matrix is widened by the number of words that only its partner knows
# NOTE: like the original loops this is not idempotent -- running the cell
# twice pads twice.
arr_n1 = _pad_zero_columns(arr_n1, len(train_2_not_1))
arr_n2 = _pad_zero_columns(arr_n2, len(train_1_not_2))

arr_nx1 = _pad_zero_columns(arr_nx1, len(test_2_not_1))
arr_nx2 = _pad_zero_columns(arr_nx2, len(test_1_not_2))

arr_nd1 = _pad_zero_columns(arr_nd1, len(dev_2_not_1))
arr_nd2 = _pad_zero_columns(arr_nd2, len(dev_1_not_2))

In [19]:
# conjoin features
# For each split: the feature names of the sentence-1 vectorizer followed by
# the words that only the sentence-2 vectorizer knows.
extra_train = list(train_2_not_1)
features_train = [*tfidf_n1.get_feature_names_out(), *extra_train]

extra_dev = list(dev_2_not_1)
features_dev = [*tfidf_nd1.get_feature_names_out(), *extra_dev]

extra_test = list(test_2_not_1)
features_test = [*tfidf_nx1.get_feature_names_out(), *extra_test]

# all lengths are to be equal to set.union

In [20]:
# process separate DataFrames for each table (TFIDF)
def process_tfidf(table, values):
  """Wrap a TF-IDF matrix in a DataFrame, one column per document.

  Parameters
  ----------
  table : numpy.ndarray
      2-D matrix with one row per document; it is transposed for display.
  values : sequence
      Index labels of the result, one per column of ``table``.

  Returns
  -------
  pandas.DataFrame
      Frame indexed by ``values`` whose columns are named ``tfidf_0``,
      ``tfidf_1``, ... in document order.
  """
  document_columns = [f'tfidf_{position}' for position, _ in enumerate(table)]
  transposed = table.T
  return pd.DataFrame(data=transposed, index=values, columns=document_columns)

In [None]:
# one display DataFrame per sentence column; both columns of a split are
# labelled with that split's combined feature list
# training columns 1 and 2
df_n1, df_n2 = (process_tfidf(matrix, features_train) for matrix in (arr_n1, arr_n2))
# dev columns 1 and 2
df_nd1, df_nd2 = (process_tfidf(matrix, features_dev) for matrix in (arr_nd1, arr_nd2))
# test columns 1 and 2
df_nx1, df_nx2 = (process_tfidf(matrix, features_test) for matrix in (arr_nx1, arr_nx2))
# example
df_n1

In [22]:
# cosine distance between matching rows of two matrices using scipy

def calculate_cosine(n1, n2):
  """Row-wise cosine distance between two equally shaped matrices.

  Parameters
  ----------
  n1, n2 : sequence of 1-D numeric arrays
      Row ``i`` of ``n1`` is compared with row ``i`` of ``n2``.

  Returns
  -------
  list of float
      ``scipy.spatial.distance.cosine`` of each row pair, i.e. a DISTANCE
      (1 - cosine similarity; 0 means same direction). If either row is all
      zeros the cosine is undefined and scipy yields NaN, which would break
      any estimator fitted on the feature; such pairs are reported as 1.0
      (no similarity) instead.
  """
  cos = []
  for i in range(len(n1)):
    row_a, row_b = n1[i], n2[i]
    if not np.any(row_a) or not np.any(row_b):
      # zero vector (e.g. a sentence with no remaining vocabulary words)
      cos.append(1.0)
    else:
      cos.append(sp.distance.cosine(row_a, row_b))
  return cos

# One value per sentence pair for each split. Despite the "cosine_results"
# names these are cosine DISTANCES (1 - similarity), as the formula below shows.
cosine_results_train = calculate_cosine(arr_n1, arr_n2)
cosine_results_dev = calculate_cosine(arr_nd1, arr_nd2)
cosine_results_test = calculate_cosine(arr_nx1, arr_nx2)

# HOW IT WORKS: 
# sp.distance.cosine(num[], num[]) -> float
# input1 <- first num[]
# input2 <- second num[]
# return (
#   1 - (
#     input1 (dot) input2 
#     /                  
#     np.sqrt( add x^2 for x in input1 ) x np.sqrt( add x^2 for x in input2 )
#   )
# )
# MORE: https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cosine.html#scipy.spatial.distance.cosine

In [23]:
# save text_similarities in pickle file since they take too long
pickle_ts1 = f'{SRC}/text_similarities_train.pickle'
pickle_ts2 = f'{SRC}/text_similarities_dev.pickle'
pickle_ts3 = f'{SRC}/text_similarities_test.pickle'
pickle_cs1 = f'{SRC}/compare_train.pickle'
pickle_cs2 = f'{SRC}/compare_dev.pickle'
pickle_cs3 = f'{SRC}/compare_test.pickle'

# Make sure every cache file exists. A missing file is created EMPTY; later
# cells treat a zero-byte file as "nothing cached yet".
for cache_path in (pickle_ts1, pickle_ts2, pickle_ts3, pickle_cs1, pickle_cs2, pickle_cs3):
  if os.path.exists(cache_path):
    continue
  open(cache_path, 'wb').close()

In [24]:
# try gensim
# source: https://betterprogramming.pub/introduction-to-gensim-calculating-text-similarity-9e8b55de342d
from gensim import corpora, models, similarities


def _pairwise_text_similarities(texts_a, texts_b):
  """TF-IDF similarity between row i of `texts_a` and row i of `texts_b`.

  The dictionary, TF-IDF model and similarity index are built ONCE from all
  of `texts_a`; each document of `texts_b` is then queried against the index
  and the score for its own row is kept. The earlier code rebuilt all three
  objects for every row over a growing prefix of the corpus, which was
  quadratic and weighted each row with a different IDF.

  Parameters
  ----------
  texts_a, texts_b : iterable of list of str
      Tokenised documents, paired by position and of equal length.

  Returns
  -------
  list
      One similarity score per pair (empty if `texts_a` is empty).
  """
  texts_a = list(texts_a)
  texts_b = list(texts_b)
  if not texts_a:
    return []
  dictionary = corpora.Dictionary(texts_a)
  corpus = [dictionary.doc2bow(tokens) for tokens in texts_a]  # put each individual entry in doc2bow
  tfidf = models.TfidfModel(corpus)
  index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary.token2id))
  scores = []
  for row, tokens in enumerate(texts_b):
    kw_vector = dictionary.doc2bow(tokens)
    # index[...] scores the query against every corpus document; keep its pair
    scores.append(index[tfidf[kw_vector]][row])
  return scores


def _cached_text_similarities(cache_path, texts_a, texts_b):
  """Load the scores from `cache_path`, or compute and store them there.

  A missing or zero-byte file counts as "not cached".
  """
  if os.path.exists(cache_path) and os.path.getsize(cache_path) > 0:
    # NOTE: pickle.load can execute arbitrary code; acceptable only because
    # these files are written by this notebook itself.
    with open(cache_path, 'rb') as readfile:
      return pickle.load(readfile)
  scores = _pairwise_text_similarities(texts_a, texts_b)
  with open(cache_path, 'wb') as writefile:
    pickle.dump(scores, writefile)
  return scores


# Each split is cached in its OWN file with its OWN scores. The previous code
# dumped text_similarities_train into the dev and test cache files as well, so
# any existing text_similarities_dev/test pickle holds training scores:
# delete those files (and the compare_*.pickle files derived from them) before
# re-running.
text_similarities_train = _cached_text_similarities(pickle_ts1, n1, n2)
text_similarities_dev = _cached_text_similarities(pickle_ts2, nd1, nd2)
text_similarities_test = _cached_text_similarities(pickle_ts3, nx1, nx2)

<h3>
  Using Algorithms
</h3>

In [25]:
def print_results(a, p):
  """Print classification metrics and append them to a timestamped file.

  Accuracy plus precision, recall and F1 (each under 'weighted', 'micro' and
  'macro' averaging) are computed with scikit-learn, written as a table to
  ``{SRC}/results-<unix timestamp>.txt`` and printed.

  Parameters
  ----------
  a : array-like
      True labels.
  p : array-like
      Predicted labels.

  Returns
  -------
  None
  """
  lbl = ['accuracy', 'precision', 'recall', 'f1']
  ind = ['weighted', 'micro', 'macro']

  acc_score = accuracy_score(a, p)
  # accuracy has no averaging mode; it is repeated so the table is rectangular
  acc = [acc_score] * len(ind)
  pre = [precision_score(a, p, average=mode) for mode in ind]
  rec = [recall_score(a, p, average=mode) for mode in ind]
  ff = [f1_score(a, p, average=mode) for mode in ind]

  df = pd.DataFrame([acc, pre, rec, ff], columns=ind, index=lbl)

  # one file per second; calls within the same second append to the same file
  ts = str(calendar.timegm(time.gmtime()))
  filename = f'{SRC}/results-{ts}.txt'

  # the with-block closes the file (the explicit close() and the unused
  # `results` DataFrame of the earlier version were dropped)
  with open(filename, 'a') as f:
    f.write(df.to_string(header=True, index=True) + "\n")

  print(' ::: weighted, micro, macro :::')
  print('ACCURACY: ', acc_score)
  print('PRECISION: ', pre)
  print('RECALL: ', rec)
  print('F1-SCORE: ', ff)

In [27]:
# before performing algorithms, we want to know whether the following influences the decision of accurately predicting the gold label:
# - cosine similarity
# - similarity based on sparse matrix
# - average length in pair of text
# - difference in length of pair of text
# - & for train/dev comparisons, add gold_label

CL = ['cosine', 'similarity', 'length_average', 'length_difference', 'label']


def _feature_rows(n_rows, *columns):
  """Return `n_rows` tuples, the i-th holding element i of every column."""
  return [tuple(column[i] for column in columns) for i in range(n_rows)]


def _cached_rows(cache_path, build_rows):
  """Load rows from `cache_path`, or call `build_rows()` and store the result.

  A missing or zero-byte file counts as "not cached". `build_rows` is only
  called on a cache miss, so the feature lists are not touched when a cache
  exists. Files are handled with `with`, so a failing load no longer leaves
  the handle open.
  NOTE: a non-empty cache always wins -- after changing any feature upstream,
  delete the compare_*.pickle files or the old values keep being used.
  """
  if os.path.exists(cache_path) and os.path.getsize(cache_path) > 0:
    # NOTE: pickle.load can execute arbitrary code; acceptable only because
    # these files are written by this notebook itself.
    with open(cache_path, 'rb') as readfile:
      return pickle.load(readfile)
  rows = build_rows()
  with open(cache_path, 'wb') as writefile:
    pickle.dump(rows, writefile)
  return rows


# train / dev rows carry the (patched) gold label as fifth element
compare_data_train = _cached_rows(pickle_cs1, lambda: _feature_rows(
  len(gold_train),
  cosine_results_train, text_similarities_train, lenavg_train, lendiff_train, gold_train
))
compare_train = pd.DataFrame(data=compare_data_train, columns=CL)


compare_data_dev = _cached_rows(pickle_cs2, lambda: _feature_rows(
  len(gold_dev),
  cosine_results_dev, text_similarities_dev, lenavg_dev, lendiff_dev, gold_dev
))
compare_dev = pd.DataFrame(data=compare_data_dev, columns=CL)


# test rows have the four features only
compare_data_test = _cached_rows(pickle_cs3, lambda: _feature_rows(
  len(cosine_results_test),
  cosine_results_test, text_similarities_test, lenavg_test, lendiff_test
))
compare_test = pd.DataFrame(data=compare_data_test, columns=CL[0:4])

In [28]:
# logistic regression (condition: estimate label based on cosine AND gensim similarity)
# x holds every column except the label; y is the label column
y = compare_train['label']
x = compare_train.drop(columns='label')

lr_1 = LogisticRegression()

# train_test_split returns, in this order:
#   x_train -> feature matrix used for fitting
#   x_test  -> feature matrix used for prediction
#   y_train -> labels used for fitting
#   y_test  -> labels the predictions are compared against
# 30% of the rows are held out; random_state fixes the shuffle
x_train, x_test, y_train, y_test = train_test_split(
  x.values,
  y.values,
  test_size=0.3,
  random_state=0,
)

In [None]:
# start with training and predict via training data (split)
# fit on the 70% portion and predict the held-out 30% from train_test_split
lr_1.fit(x_train, y_train)
preds_1 = lr_1.predict(x_test)

# prints the metrics and also appends them to a results-<timestamp>.txt file under SRC
print_results(y_test, preds_1)

In [None]:
# next, try with training and predict via dev data
# a: dev features (every column except 'label'); b: dev labels
a = compare_dev.drop('label', axis=1)
b = compare_dev['label']

# x, y, a, b
# the same estimator object is re-fitted, this time on the FULL training
# table, replacing the fit on the 70% split
lr_1.fit(x.values, y.values)
preds_2 = lr_1.predict(a.values)

print_results(b.values, preds_2)

In [None]:
# use same logistic regression for predicting gold labels
# compare_data_test is the list of 4-value rows (cosine, similarity,
# length_average, length_difference) built for the test split
final_preds = lr_1.predict(compare_data_test)
# NOTE: predictions are only printed here; nothing in this cell saves them
print(final_preds)

<h3>
  Sources
</h3>

<span>
  External Links
</span>
<br/>
<ul>
<li><a href="https://github.com/stephull/MLMidTerm" target='_blank'>GitHub (Forked Repo)</a></li>
</ul>