In [1]:
#### Import Data #### 

import pandas as pd 
import numpy as np 
from sklearn import preprocessing 

# Import training set 
with open('train_with_label.txt') as file:
    train_buffer = file.readlines()
    train = pd.DataFrame([row.split('\t') for row in train_buffer], columns=['ID', 'sentence1', 'sentence2', 'groundTruth'])
    train.groundTruth = train.groundTruth.apply(lambda x: int(x.rstrip()))

# Import dev set 
with open('dev_with_label.txt') as file:
    dev_buffer = file.readlines()
    dev = pd.DataFrame([row.split('\t') for row in dev_buffer], columns=['ID', 'sentence1', 'sentence2', 'groundTruth'])
    dev.groundTruth = dev.groundTruth.apply(lambda x: int(x.rstrip()))

# Import testing set 
with open('test_without_label.txt') as file:
    test_buffer = file.readlines()
    test = pd.DataFrame([row.split('\t') for row in test_buffer], columns=['ID', 'sentence1', 'sentence2']) 

    
# Print out shapes 
print("Train With Label Shape: " + str(train.shape)) 
print("Dev With Label Shape: " + str(dev.shape)) 
print("Test Without Label Shape: " + str(test.shape)) 


Train With Label Shape: (7801, 4)
Dev With Label Shape: (4000, 4)
Test Without Label Shape: (4000, 3)


In [2]:
#### Generate Features #### 

import nltk 
from Levenshtein import distance as lev 
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer 
from sklearn.feature_extraction.text import TfidfVectorizer 
from nltk.corpus import wordnet 
from nltk.translate.bleu_score import SmoothingFunction 


def commonSynonyms(wordSet1, wordSet2): 
    commonSynonyms = 0 
    for word1 in wordSet1: # For every word in set 1  
        synonyms = [] # Get synonyms for this word
        for syn in wordnet.synsets(word1): 
            for lem in syn.lemmas(): 
                synonyms.append(lem.name()) 
        # For each word in set 2 
        for word2 in wordSet2: 
            if word2 in synonyms: 
                commonSynonyms = commonSynonyms + 1 
    return commonSynonyms 

# Function: Generate features provided two sentences 
def featureCreation(frame): # Pass in the information 
    df = pd.DataFrame(frame) # Convert to dataFrame 
     
    # Vectorize each sentence and clean each element 
    token = WhitespaceTokenizer() 
    df['words1'] = df['sentence1'].str.replace('[^\'0-9A-Za-z ]+', '').str.lower().str.replace(" '", "'").str.replace("  ", " ").str.split(' ') 
    df['words2'] = df['sentence2'].str.replace('[^\'0-9A-Za-z ]+', '').str.lower().str.replace(" '", "'").str.replace("  ", " ").str.split(' ') 
    df['words1'] = df.apply(lambda x: (list(filter(lambda item: item, x['words1']))), axis = 1) 
    df['words2'] = df.apply(lambda x: (list(filter(lambda item: item, x['words2']))), axis = 1) 
    
    # Lemmatize each word 
    lemmatizer = WordNetLemmatizer() 
    df['words1'] = df.apply(lambda x: ([lemmatizer.lemmatize(word) for word in x['words1']]), axis = 1) 
    df['words2'] = df.apply(lambda x: ([lemmatizer.lemmatize(word) for word in x['words2']]), axis = 1) 

    # Word stemmer: 
    ps = PorterStemmer() # Also try this later 
    snow_stemmer = SnowballStemmer(language='english') 
    df['words1'] = df.apply(lambda x: ([snow_stemmer.stem(word) for word in x['words1']]), axis = 1) 
    df['words2'] = df.apply(lambda x: ([snow_stemmer.stem(word) for word in x['words2']]), axis = 1) 
 

    df['CommonSynonymCount'] = df.apply(lambda x: commonSynonyms(x['words1'], x['words2']), axis = 1)
    df['WordsInCommon'] = df.apply(lambda x: (len(np.intersect1d(x['words1'],x['words2']))), axis = 1) 
    
    # Find the number of words in common, then divide by the average number of words between the two sentences 
    df['WordsInCommon/AverageLen'] = df.apply(lambda x: (len(np.intersect1d(x['words1'],x['words2']))) / (len(x['words1']) + len(x['words2']))/2, axis = 1)
    
    # Obtain the difference in length between the two sentences 
    df['differenceInLength'] = df.apply(lambda x: abs(len(x['words1']) - len(x['words2'])), axis = 1)

    # BiLingual Evaluation Understudy Score 
    df['Bleu Score'] = df.apply(lambda x: nltk.translate.bleu_score.sentence_bleu([x['words1']], x['words2'], auto_reweigh = True), axis = 1) 

    # Levenshtein Score 
    df['Levenshtein'] = 0
    for i in range(len(df)): 
        df['Levenshtein'].iloc[i] = lev(df['words1'].iloc[i], df['words2'].iloc[i]) / ((len(df['words1'].iloc[i]) + len(df['words2'].iloc[i])) / 2) 

    return df 

In [3]:
import matplotlib.pyplot as plt 
from sklearn import linear_model 
from sklearn.linear_model import LogisticRegression
from sklearn import metrics 
from sklearn.utils import resample 
from sklearn.preprocessing import MinMaxScaler 
from sklearn.svm import SVC 
from sklearn.metrics import accuracy_score 
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance 
from gensim.models import Word2Vec 
from sklearn.feature_extraction.text import TfidfVectorizer 

# Generate the features the datasets: 
train = featureCreation(train) 
dev = featureCreation(dev) 
test = featureCreation(test) 

# # Upsampling: 
print("Number of zeros in the training set: " + str((train['groundTruth'] == 0).sum())) 

# # Thus, an upsampling must be preformed: 
zeros = train[train['groundTruth'] == 0] 
ones = train[train['groundTruth'] == 1] 
ones = resample(ones, replace = True, n_samples = len(zeros), random_state = 1)
train = pd.concat([zeros, ones]) 
train = train.reset_index() 

print("New number of zeros in the training set: " + str((train['groundTruth'] == 0).sum())) 
print("New number of ones in the training set: " + str((train['groundTruth'] == 1).sum())) 

# Set features that will be used in the model:  
X_train = train[['WordsInCommon/AverageLen', 'Bleu Score', 'Levenshtein', 'differenceInLength', 'WordsInCommon/AverageLen', 'CommonSynonymCount', 'WordsInCommon']] # Training X 
X_dev = dev[['WordsInCommon/AverageLen', 'Bleu Score', 'Levenshtein', 'differenceInLength', 'WordsInCommon/AverageLen', 'CommonSynonymCount', 'WordsInCommon']] # Dev X 
y_train = train['groundTruth'] # Training Y 
y_dev = dev['groundTruth'] # Dev Y 
# What you will run the program on: 
X_test = test[['WordsInCommon/AverageLen', 'Bleu Score', 'Levenshtein', 'differenceInLength', 'WordsInCommon/AverageLen', 'CommonSynonymCount', 'WordsInCommon']] # Final testing X 

# Normalize the data: 
scaler = MinMaxScaler() 
X_train[['WordsInCommon/AverageLen', 'Bleu Score', 'Levenshtein', 'differenceInLength', 'WordsInCommon/AverageLen', 'CommonSynonymCount', 'WordsInCommon']] = scaler.fit_transform(X_train[['WordsInCommon/AverageLen', 'Bleu Score', 'Levenshtein', 'differenceInLength', 'WordsInCommon/AverageLen', 'CommonSynonymCount', 'WordsInCommon']]) 
X_dev[['WordsInCommon/AverageLen', 'Bleu Score', 'Levenshtein', 'differenceInLength', 'WordsInCommon/AverageLen', 'CommonSynonymCount', 'WordsInCommon']] = scaler.transform(X_dev[['WordsInCommon/AverageLen', 'Bleu Score', 'Levenshtein', 'differenceInLength', 'WordsInCommon/AverageLen', 'CommonSynonymCount', 'WordsInCommon']]) 
X_test[['WordsInCommon/AverageLen', 'Bleu Score', 'Levenshtein', 'differenceInLength', 'WordsInCommon/AverageLen', 'CommonSynonymCount', 'WordsInCommon']] = scaler.transform(X_test[['WordsInCommon/AverageLen', 'Bleu Score', 'Levenshtein', 'differenceInLength', 'WordsInCommon/AverageLen', 'CommonSynonymCount', 'WordsInCommon']]) 


  df['words1'] = df['sentence1'].str.replace('[^\'0-9A-Za-z ]+', '').str.lower().str.replace(" '", "'").str.replace("  ", " ").str.split(' ')
  df['words2'] = df['sentence2'].str.replace('[^\'0-9A-Za-z ]+', '').str.lower().str.replace(" '", "'").str.replace("  ", " ").str.split(' ')
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
A value is trying to be set on a copy of 

Number of zeros in the training set: 5901
New number of zeros in the training set: 5901
New number of ones in the training set: 5901


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[iloc] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[iloc] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[iloc] = igetitem(value, i)


In [4]:
from sklearn.neural_network import MLPClassifier 
from sklearn.metrics import f1_score

# Apply MLP Classifier 

model = MLPClassifier(random_state = 1, max_iter = 400, learning_rate = 'adaptive').fit(X_train, y_train) 

prediction = model.predict(X_dev) 
print(f1_score(prediction, y_dev)) 


0.8169491525423729


In [5]:
# Project Submission: 
y_pred = model.predict(X_test) 
test['y_pred'] = y_pred.tolist() 
submission = test[['ID', 'y_pred']] 
submission.to_csv('Will_Schenk_test_result.txt', sep ='\t', header = False, index = False)
