In [32]:
import pickle
import re

import numpy as np
import pandas as pd
from gensim.models import word2vec
from nltk.corpus import stopwords

In [33]:
# Load the word2vec model previously saved at this path (the file name
# suggests it was trained on the full data without stopwords -- TODO confirm).
model = word2vec.Word2Vec.load(
    'data/quora/word2vec_full_data_wo_stopwords'
)

In [64]:
def cosine(v1, v2):
    """Return the cosine similarity of two vectors as a Python float.

    Parameters
    ----------
    v1, v2 : array-like
        Numeric sequences of equal length; each is converted with
        ``np.array`` before use.

    Returns
    -------
    float
        ``dot(v1, v2) / (||v1|| * ||v2||)``.  Cosine similarity is undefined
        when either vector has zero norm; 0.0 is returned in that case rather
        than dividing by zero (which would yield NaN plus a RuntimeWarning).
        NaN values in the inputs still propagate to a NaN result.
    """
    v1 = np.array(v1)
    v2 = np.array(v2)
    norm_product = np.sqrt(np.sum(v1**2)) * np.sqrt(np.sum(v2**2))
    # A NaN norm compares unequal to 0, so NaN inputs fall through unchanged.
    if norm_product == 0:
        return 0.0
    return float(np.dot(v1, v2) / norm_product)

In [48]:
# Stopword sets keyed by language, filled on first use so the NLTK corpus is
# read once instead of on every call.
_STOPWORD_CACHE = {}


def question_to_wordlist(question, remove_stopwords=True):
    """Split a question into a list of lowercase, letters-only words.

    Parameters
    ----------
    question : object
        The question text.  Non-string values are converted with ``str``;
        ``None`` and float NaN (how pandas represents a missing cell) are
        treated as an empty question so they do not become the literal
        tokens "none" / "nan".
    remove_stopwords : bool, default True
        When true, drop words found in NLTK's English stopword list.

    Returns
    -------
    list of str
        The remaining words in their original order.
    """
    # ``x != x`` is true only for NaN.
    if question is None or (isinstance(question, float) and question != question):
        return []
    # Every character that is not an ASCII letter becomes a separator.
    question = re.sub("[^a-zA-Z]", " ", str(question))
    words = question.lower().split()
    if remove_stopwords:
        if "english" not in _STOPWORD_CACHE:
            _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
        stops = _STOPWORD_CACHE["english"]
        words = [w for w in words if w not in stops]
    return words

In [36]:
# Read the train and test CSV files into DataFrames (train first, then test).
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/quora/train.csv', 'data/quora/test.csv')
)

In [43]:
def makeFeatureVec(words, model, num_features):
    """Average the embedding vectors of the words that the model knows.

    Parameters
    ----------
    words : iterable of str
        Tokens of one question.
    model : mapping-like
        Supports ``word in model`` and ``model[word]``.  If the object has a
        ``wv`` attribute (as gensim >= 1.0 Word2Vec models do), lookups go
        through it instead of the deprecated direct indexing.
    num_features : int
        Length of each embedding vector.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``num_features`` holding the mean vector.  When
        none of ``words`` is in the vocabulary (or ``words`` is empty) a
        float32 zero vector is returned instead of dividing by zero, which
        previously produced an all-NaN vector.
    """
    vectors = getattr(model, "wv", model)
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    for word in words:
        if word in vectors:
            nwords += 1
            featureVec = np.add(featureVec, vectors[word])
    if nwords == 0:
        # No known words: the running sum is still the zero vector.
        return featureVec
    return np.divide(featureVec, float(nwords))


def getAvgFeatureVecs(questions, model, num_features, progress_every=100000):
    """Build one averaged word-vector row per question.

    Each question is tokenised with ``question_to_wordlist`` and reduced to
    a single vector with ``makeFeatureVec``.

    Parameters
    ----------
    questions : sized iterable
        The questions to embed; must support ``len()`` and iteration.
    model : object
        Word-vector model passed through to ``makeFeatureVec``.
    num_features : int
        Length of each embedding vector.
    progress_every : int, default 100000
        Print a progress line every this many questions; a falsy value
        disables progress output.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(questions), num_features)``.
    """
    total = len(questions)
    questionFeatureVecs = np.zeros((total, num_features), dtype="float32")
    # An integer row index is required: NumPy rejects float indices.
    for row, question in enumerate(questions):
        words = question_to_wordlist(question)
        if progress_every and row % progress_every == 0:
            # Single-argument form works both as a Python 2 print statement
            # and as a Python 3 print() call.
            print("Review %d of %d" % (row, total))
        questionFeatureVecs[row] = makeFeatureVec(words, model, num_features)
    return questionFeatureVecs

In [52]:
# One averaged 300-feature vector per training `question1` entry.
train_qn1_avg_vector = getAvgFeatureVecs(
    questions=data_train['question1'],
    model=model,
    num_features=300,
)

Review 0 of 404290




Review 100000 of 404290
Review 200000 of 404290
Review 300000 of 404290
Review 400000 of 404290


In [49]:
# One averaged 300-feature vector per training `question2` entry.
train_qn2_avg_vector = getAvgFeatureVecs(
    questions=data_train['question2'],
    model=model,
    num_features=300,
)

Review 0 of 404290




Review 100000 of 404290
Review 200000 of 404290
Review 300000 of 404290
Review 400000 of 404290


In [65]:
# Cosine similarity of each (question1, question2) pair of averaged vectors,
# in row order.
cosine_sim = [
    cosine(x, y)
    for x, y in zip(train_qn1_avg_vector, train_qn2_avg_vector)
]

In [72]:
# Wrap the training similarities in a one-column DataFrame and pickle it.
cosine_sim_df = pd.DataFrame(cosine_sim, columns=['cosine_sim'])
# Pickle data is bytes, so the file must be opened in binary mode; text mode
# ('w') fails on Python 3 and can corrupt the data on Windows.
with open('data/quora/train_cosine_sim.pickle', 'wb') as f:
    pickle.dump(cosine_sim_df, f)

In [71]:
# Preview the first five rows of the training similarity frame.
cosine_sim_df.head(n=5)

Unnamed: 0,cosine_sim
0,0.961703
1,0.747565
2,0.704237
3,0.09308
4,0.579749


In [73]:
# Averaged 300-feature vectors for both question columns of the test set
# (question1 is processed first, then question2).
test_qn1_avg_vector = getAvgFeatureVecs(
    questions=data_test['question1'],
    model=model,
    num_features=300,
)
test_qn2_avg_vector = getAvgFeatureVecs(
    questions=data_test['question2'],
    model=model,
    num_features=300,
)

Review 0 of 2345796




Review 100000 of 2345796
Review 200000 of 2345796
Review 300000 of 2345796
Review 400000 of 2345796
Review 500000 of 2345796
Review 600000 of 2345796
Review 700000 of 2345796
Review 800000 of 2345796
Review 900000 of 2345796
Review 1000000 of 2345796
Review 1100000 of 2345796
Review 1200000 of 2345796
Review 1300000 of 2345796
Review 1400000 of 2345796
Review 1500000 of 2345796
Review 1600000 of 2345796
Review 1700000 of 2345796
Review 1800000 of 2345796
Review 1900000 of 2345796
Review 2000000 of 2345796
Review 2100000 of 2345796
Review 2200000 of 2345796
Review 2300000 of 2345796
Review 0 of 2345796
Review 100000 of 2345796
Review 200000 of 2345796
Review 300000 of 2345796
Review 400000 of 2345796
Review 500000 of 2345796
Review 600000 of 2345796
Review 700000 of 2345796
Review 800000 of 2345796
Review 900000 of 2345796
Review 1000000 of 2345796
Review 1100000 of 2345796
Review 1200000 of 2345796
Review 1300000 of 2345796
Review 1400000 of 2345796
Review 1500000 of 2345796
Review 160

In [74]:
# Cosine similarity of each test (question1, question2) pair, in row order.
test_cosine_sim = [
    cosine(x, y)
    for x, y in zip(test_qn1_avg_vector, test_qn2_avg_vector)
]
test_cosine_sim_df = pd.DataFrame(test_cosine_sim, columns=['cosine_sim'])
# Pickle data is bytes, so the file must be opened in binary mode; text mode
# ('w') fails on Python 3 and can corrupt the data on Windows.
with open('data/quora/test_cosine_sim.pickle', 'wb') as f:
    pickle.dump(test_cosine_sim_df, f)