In [96]:
import pandas as pd
import numpy as np
from sklearn import tree
import csv

In [97]:
# Dataset folder and the files inside it that the notebook reads.
base_dir = './Quora_question_pair_partition/'
train_dir, dev_dir, test_dir, word_dir = (
    base_dir + file_name
    for file_name in ('train.tsv', 'dev.tsv', 'test.tsv', 'wordvec.txt')
)
# token -> embedding lookup; filled in by loadWordDic()
words_vec = {}

In [98]:
def loadWordDic():
    """Fill the module-level ``words_vec`` dict from the file at ``word_dir``.

    Each non-empty line is expected to be ``<token> <v1> ... <v300>`` separated
    by single spaces. The token becomes the key and the values are stored as a
    ``(1, 300)`` float array. A running count is printed every 5000 entries.

    Raises:
        ValueError: if a line does not carry exactly 300 numeric values.
    """
    # newline='' is what the csv module asks for.
    # Assumes the embedding file is UTF-8 encoded -- TODO confirm for other dumps.
    with open(word_dir, 'r', encoding='utf-8', newline='') as vec_file:
        # QUOTE_NONE: tokens are raw text, so a token that starts with '"'
        # must not open a quoted field that swallows the following lines.
        reader = csv.reader(vec_file, delimiter=' ', quoting=csv.QUOTE_NONE)
        cnt = 0
        for row in reader:
            if not row:
                # blank line (e.g. at end of file): nothing to store
                continue
            # `if x` drops the empty field produced by a trailing space
            values = [float(x) for x in row[1:] if x]
            words_vec[row[0]] = np.array(values).reshape((1, 300))
            cnt += 1
            if cnt % 5000 == 0:
                print(cnt)
# Populate words_vec from word_dir; the function prints a running count every 5000 rows.
loadWordDic()

5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000


In [99]:
# read data
def _read_split(path):
    """Read one ``label, q1, q2, id`` TSV file and return ``(questions, labels)``."""
    frame = pd.read_csv(
        path,
        sep='\t',
        header=None,
        names=['label', 'q1', 'q2', 'id'],
        dtype={'q1': str, 'q2': str},
        # The questions are raw text: a leading '"' is part of the question,
        # not the start of a quoted field.
        quoting=csv.QUOTE_NONE,
        # Keep questions such as "NA" / "null" / "" as strings instead of
        # float NaN, which would break the later ``.split`` calls.
        keep_default_na=False,
    )
    return frame[['q1', 'q2']], frame['label']


def loadData():
    """Load the train, dev and test splits.

    Reads ``train_dir``, ``dev_dir`` and ``test_dir`` as header-less,
    tab-separated files with the columns ``label, q1, q2, id``.

    Returns:
        ``(train_X, train_y, dev_X, dev_y, test_X, test_y)`` where each ``*_X``
        is a DataFrame with the columns ``q1`` and ``q2`` and each ``*_y`` is
        the matching ``label`` Series.
    """
    train_X, train_y = _read_split(train_dir)
    dev_X, dev_y = _read_split(dev_dir)
    test_X, test_y = _read_split(test_dir)
    return train_X, train_y, dev_X, dev_y, test_X, test_y

In [100]:
def generate_features(df):
    """Build the model features for a frame of question pairs.

    Args:
        df: DataFrame with string columns ``q1`` and ``q2``.

    Returns:
        ``(feature_names, features_df)``. ``features_df`` is a copy of ``df``
        with one extra column per feature; ``feature_names`` lists those
        columns (currently ``['cnt', 'dis']``). The input frame is left
        untouched, so calling this repeatedly on the same frame is safe.
    """
    feature_names = []
    # Work on a copy: the callers pass column slices of the loaded frames and
    # adding columns to those in place mutates shared state.
    df = df.copy()

    # absolute difference in the number of space-separated tokens
    q1_tokens = df['q1'].str.split(' ').str.len()
    q2_tokens = df['q2'].str.split(' ').str.len()
    df['cnt'] = (q1_tokens - q2_tokens).abs()
    feature_names.append('cnt')

    # distance between sentences
    df['dis'] = df.apply(dis, axis=1)
    feature_names.append('dis')

    # TODO: magic feature: question frequency (not implemented yet)

    print("Finish generating features")

    return feature_names, df

In [101]:
## train model
def train(train_X, train_y, para):
    """Fit a decision tree on the features derived from ``train_X``.

    Args:
        train_X: frame handed to ``generate_features``.
        train_y: labels aligned with ``train_X``.
        para: mapping of hyper-parameters; only ``para['max_depth']`` is read.

    Returns:
        ``(acc, clf)``: the accuracy of the fitted tree on the training data
        itself, and the fitted classifier.
    """
    columns, featured = generate_features(train_X)
    design = featured[columns]

    clf = tree.DecisionTreeClassifier(max_depth=para['max_depth'])
    clf.fit(design, train_y)

    acc = clf.score(design, train_y)
    print("Accuracy on the training set:", acc)

    return acc, clf

In [102]:
## select hyper parameters on the dev set
def select_model(train_X, train_y, dev_X, dev_y, paras):
    """Train one model per hyper-parameter set and keep the best on the dev set.

    Args:
        train_X, train_y: training questions and labels, passed to ``train``.
        dev_X, dev_y: development questions and labels used for selection.
        paras: iterable of hyper-parameter mappings, each passed to ``train``.

    Returns:
        ``(max_acc, best_model)``: the highest dev accuracy and the model that
        achieved it (the earliest one on ties). ``(0, None)`` when ``paras``
        is empty.
    """
    max_acc = 0
    best_model = None
    # The dev features do not depend on the hyper-parameters: build them once.
    feature_names, dev_df = generate_features(dev_X)
    dev_features = dev_df[feature_names]
    for para in paras:
        _, model = train(train_X, train_y, para)
        dev_acc = model.score(dev_features, dev_y)
        # `best_model is None` makes sure the first candidate is kept even if
        # its dev accuracy is 0.
        if best_model is None or dev_acc > max_acc:
            max_acc = dev_acc
            best_model = model
    # Return the best model, not whichever one happened to be trained last.
    return max_acc, best_model

In [103]:
## evaluate on the test set
def test(test_X, test_y, model):
    feature_names, df = generate_features(test_X)
    acc = model.score(df[feature_names], test_y)
    return acc

In [104]:
def main():
    """Run the whole experiment and print the dev and test accuracies.

    Loads the three splits, selects a model over the candidate
    hyper-parameters on the dev set, then scores that model on the test set.
    """
    # read data
    splits = loadData()
    train_X, train_y, dev_X, dev_y, test_X, test_y = splits
    print("Finish loading data")

    # candidate hyper-parameter sets tried by select_model
    candidate_paras = [{'max_depth': 100}]

    acc_dev, model = select_model(train_X, train_y, dev_X, dev_y, candidate_paras)
    print("Accuracy on the development set is:", acc_dev)

    print("Accuracy on the test set is:", test(test_X, test_y, model))

In [None]:
# Run the full pipeline: load data, select a model on the dev set, report test accuracy.
# generate_features() calls dis(), which is defined in a later cell, so that cell
# has to be executed before this one.
main()

In [None]:
############################################Helper function############################################################

In [107]:
def _mean_word_vector(sentence, verbose=False):
    """Average the ``words_vec`` embeddings of the space-separated tokens of ``sentence``."""
    total = np.zeros((1, 300))
    hits = 0
    for word in sentence.split(' '):
        vec = words_vec.get(word.lower())
        if vec is not None:
            total += vec
            hits += 1
        elif verbose:
            print("Skip word", word)
    # No known token: return the zero vector instead of dividing 0 by 0 (NaN).
    return total / hits if hits else total


def dis(row, verbose=False):
    """Signed difference between the mean word vectors of ``q1`` and ``q2``.

    Each question is split on single spaces, tokens are lower-cased and looked
    up in ``words_vec``; tokens that are not found are ignored. The per-question
    mean vectors are subtracted and the components of the difference are summed.

    Args:
        row: mapping / Series with string entries ``q1`` and ``q2``.
        verbose: when True, print every token that is missing from
            ``words_vec``. Off by default because this is called once per row
            and would otherwise flood the output.

    Returns:
        A float. A question with no known token contributes a zero vector, so
        the result is always finite.
    """
    # NOTE(review): this is a signed sum, not a norm, so positive and negative
    # components can cancel out -- confirm that this is the intended feature.
    mean_q1 = _mean_word_vector(row['q1'], verbose)
    mean_q2 = _mean_word_vector(row['q2'], verbose)
    return (mean_q1 - mean_q2).sum()