In [258]:
import csv
import config
import json
import os
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
import random
import re
import scipy as sp
import xgboost as xgb

from functools import partial
from scipy.stats import pearsonr

from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, \
    ExtraTreesClassifier, VotingClassifier
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.svm import SVC

from utils.feature_utils import load_features, load_target, calibrate, log_loss_dup
from utils.file_utils import save_to_csv

In [2]:
%matplotlib inline

In [231]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
def read_train_csv_as_rdd(name):
    return sc.parallelize([{
        'id': r.id, 'qid1': r.qid1, 'qid2': r.qid2, 'question1': r.question1, 'question2': r.question2, 'is_duplicate': r.is_duplicate
    } for r in pd.read_csv(name, keep_default_na=False).itertuples()])

def read_test_csv_as_rdd(name):
    return sc.parallelize([{
        'test_id': r.test_id, 'question1': r.question1, 'question2': r.question2
    } for r in pd.read_csv(name, keep_default_na=False).itertuples()])

In [14]:
train_df = pd.read_csv('data/train.csv', keep_default_na=False)
test_df = pd.read_csv('data/test.csv', keep_default_na=False)

In [160]:
with open('data/train.json', 'w') as train_json:
    for r in train_df.itertuples():
        json.dump(
            {
                'id': r.id, 'qid1': r.qid1, 'qid2': r.qid2, 'question1': r.question1, 'question2': r.question2, 'is_duplicate': r.is_duplicate
            },
            train_json)
        train_json.write('\n')

In [161]:
with open('data/test.json', 'w') as test_json:
    for r in test_df.itertuples():
        json.dump(
            {
                'test_id': r.test_id, 'question1': r.question1, 'question2': r.question2
            },
            test_json)
        test_json.write('\n')

## Preprocessing

In [5]:
from data_preprocess import preprocess_text

In [55]:
train_data = sc.textFile(os.path.join(config.DATA_DIR, 'train.json')).map(lambda r: json.loads(r))
test_data = sc.textFile(os.path.join(config.DATA_DIR, 'test.json')).map(lambda r: json.loads(r))

In [56]:
def clean_question(r):
    r['question1'] = preprocess_text(r['question1'])
    r['question2'] = preprocess_text(r['question2'])
    return r

cleaned_train_data = train_data.map(clean_question)
cleaned_test_data = test_data.map(clean_question)

In [57]:
save_to_csv(pd.read_json(json.dumps(cleaned_train_data.collect())), os.path.join(config.DATA_DIR, 'clean_train.csv'))
save_to_csv(pd.read_json(json.dumps(cleaned_test_data.collect())), os.path.join(config.DATA_DIR, 'clean_test.csv'))

## Feature

In [7]:
INCLUDE_TEST = True

In [8]:
def transform_feature(r, feature_transform):
    r[feature_transform.feature_name()] = feature_transform.transform(r['question1'], r['question2'])
    return r

def transform_all_feature(rdd, feature_transform, partition=100000):
    def generate_feature(data):
        new_data = []
        question1 = []
        question2 = []

        for i, r in enumerate(data):
            new_data.append(r)
            question1.append(r['question1'])
            question2.append(r['question2'])

            if i % partition == 0:
                feature = feature_transform.transform_all(question1, question2)
                for j, v in enumerate(feature):
                    new_data[j][feature_transform.feature_name()] = v
                    yield new_data[j]

                new_data = []
                question1 = []
                question2 = []
        
        if len(new_data) > 0:
            feature = feature_transform.transform_all(question1, question2)
            for j, v in enumerate(feature):
                new_data[j][feature_transform.feature_name()] = v
                yield new_data[j]

    return rdd.repartition(1).mapPartitions(generate_feature)

def corr(rdd, feature_transform):
    feature_name = feature_transform.feature_name()
    x_y = rdd.map(lambda r: (r[feature_name], r['is_duplicate'])).collect()
    r = pearsonr([x for (x, _) in x_y], [y for (_, y) in x_y])[0]
    if math.isnan(r):
        return 0
    else:
        return r
    
def save_feature_to_csv(rdd, feature_transform, data_format='train', partition=100000):
    feature_name = feature_transform.feature_name()
    def handle_data(data):
        with open(os.path.join(config.FEATURE_DIR, data_format, '%s.csv' % feature_name), 'wa') as out:
            out_data = []
            for i, r in enumerate(data):
                new_r = {'id': r['id']} if data_format == 'train' else {'test_id': r['test_id']}
                new_r[feature_name] = r[feature_name]
                out_data.append(new_r)
                if i == 0:
                    pd.read_json(json.dumps(out_data)).to_csv(out, index=False, quoting=csv.QUOTE_ALL, header=True)
                    out_data = []
                elif i % partition == 0:
                    print 'Testing: ', i
                    pd.read_json(json.dumps(out_data)).to_csv(out, index=False, quoting=csv.QUOTE_ALL, header=False)
                    out_data = []
            
            if len(out_data) > 0:
                pd.read_json(json.dumps(out_data)).to_csv(out, index=False, quoting=csv.QUOTE_ALL, header=False)
        
        yield 'Done'
                    

    if rdd.getNumPartitions() != 1:
        rdd = rdd.repartition(1)

    rdd.mapPartitions(handle_data).count()

def corpus_rdd(train_data, test_data):
    def yield_train_question(r):
        yield (r['qid1'], r['question1'])
        yield (r['qid2'], r['question2'])
    
    def yield_test_question(r):
        yield r['question1']
        yield r['question2']
    return train_data.flatMap(yield_train_question).\
        reduceByKey(lambda a, _: a).\
        values().\
        union(test_data.flatMap(yield_test_question)).\
        distinct()

def save_corpus(corpus):
    with open(os.path.join(config.DATA_DIR, 'corpus.txt'), 'w') as f:
        for question in corpus:
            f.write(question + '\n')
    
def read_corpus():
    corpus = []
    with open(os.path.join(config.DATA_DIR, 'corpus.txt')) as f:
        for line in f:
            corpus.append(line[:-1])
    return corpus

In [9]:
train_data = read_train_csv_as_rdd(os.path.join(config.DATA_DIR, 'clean_train.csv'))
test_data = read_test_csv_as_rdd(os.path.join(config.DATA_DIR, 'clean_test.csv'))

In [10]:
corpus_data = corpus_rdd(train_data, test_data)

In [11]:
corpus_data.count()

4780574

### Jaccard feature

In [64]:
from feature_jaccard import JaccardDistanceTransform

In [72]:
def generate_jaccard_feature(include_test = False):
    ngrams = [1, 2, 3]
    for ngram in ngrams:
        jaccard_distance_transform = JaccardDistanceTransform(ngram)
        train_data_features = train_data.\
            map(lambda r: transform_feature(r, jaccard_distance_transform))
        print "Corr for %d-gram: %f" % (ngram, corr(train_data_features, jaccard_distance_transform))
        save_feature_to_csv(train_data_features, jaccard_distance_transform, 'train')

        if include_test:
            test_data_features = test_data.\
                map(lambda r: transform_feature(r, jaccard_distance_transform))
            save_feature_to_csv(test_data_features, jaccard_distance_transform, 'test')

generate_jaccard_feature(INCLUDE_TEST)

Corr for 1-gram: 0.346723
Partitions:  8
Testing:  1
Testing:  1
Corr for 2-gram: 0.220864
Partitions:  8
Testing:  1
Testing:  1
Corr for 3-gram: 0.142528
Partitions:  8
Testing:  1
Testing:  1


In [None]:
# Corr for 1-gram: 0.346723
# Corr for 2-gram: 0.220864
# Corr for 3-gram: 0.142528

### TFIDF feature

In [77]:
from feature_vector import TfidfCosineSimilarityTransform, TfidfCharCosineSimilarityTransform

In [80]:
def generate_tfidf_cosince_similarity_feature(include_test=False):
    ngrams = [1, 2, 3]
    for ngram in ngrams:
        tfidf_cosine_similarity_transform = TfidfCosineSimilarityTransform(corpus_data.collect(), ngram)
        train_data_features = transform_all_feature(train_data, tfidf_cosine_similarity_transform, partition=50000)
        print "Corr for %d-gram: %f" % (ngram, corr(train_data_features, tfidf_cosine_similarity_transform))
        save_feature_to_csv(train_data_features, tfidf_cosine_similarity_transform, 'train')

        if include_test:
            test_data_features = transform_all_feature(test_data, tfidf_cosine_similarity_transform, partition=50000)
            save_feature_to_csv(test_data_features, tfidf_cosine_similarity_transform, 'test')

generate_tfidf_cosince_similarity_feature(INCLUDE_TEST)

Corr for 1-gram: 0.407039
Corr for 2-gram: 0.209158
Corr for 3-gram: 0.149541


In [None]:
# Corr for 1-gram: 0.407039
# Corr for 2-gram: 0.209158
# Corr for 3-gram: 0.149541

In [None]:
def generate_tfidf_char_cosince_similarity_feature(include_test=False):
    ngrams = [3]
    for ngram in ngrams:
        tfidf_char_cosine_similarity_transform = TfidfCharCosineSimilarityTransform(corpus_data.collect(), ngram)
        train_data_features = transform_all_feature(train_data, tfidf_char_cosine_similarity_transform)
        print "Corr for %d-gram: %f" % (ngram, corr(train_data_features, tfidf_char_cosine_similarity_transform))
        save_feature_to_csv(train_data_features, tfidf_char_cosine_similarity_transform, 'train')

        if include_test:
            test_data_features = transform_all_feature(test_data, tfidf_char_cosine_similarity_transform)
            save_feature_to_csv(test_data_features, tfidf_char_cosine_similarity_transform, 'test')

generate_tfidf_char_cosince_similarity_feature(INCLUDE_TEST)

### LSA feature

In [81]:
from feature_vector import LSACosineSimilarityTransform

In [82]:
def generate_lsa_cosince_similarity_feature(include_test=False):
    ngrams = [1, 2, 3]
    for ngram in ngrams:
        lsa_cosine_similarity_transform = LSACosineSimilarityTransform(corpus_data.collect(), ngram)
        train_data_features = transform_all_feature(train_data, lsa_cosine_similarity_transform)
        print "Corr for %d-gram: %f" % (ngram, corr(train_data_features, lsa_cosine_similarity_transform))
        save_feature_to_csv(train_data_features, lsa_cosine_similarity_transform, 'train')

        if include_test:
            test_data_features = transform_all_feature(test_data, lsa_cosine_similarity_transform)
            save_feature_to_csv(test_data_features, lsa_cosine_similarity_transform, 'test')

generate_lsa_cosince_similarity_feature(INCLUDE_TEST)

Corr for 1-gram: 0.301804
Corr for 2-gram: 0.182017
Corr for 3-gram: 0.159962


In [None]:
# Corr for 1-gram: 0.301804
# Corr for 2-gram: 0.182017
# Corr for 3-gram: 0.159962

## Stacking features

In [225]:
from utils.feature_utils import generate_stacking_feature

In [230]:
train_features = load_features()
train_target = load_target()
dev_train_features, dev_test_features, dev_train_target, dev_test_target = train_test_split(
    train_features, train_target, test_size=config.TEST_SIZE, random_state=config.RANDOM_SEED
)

### Random Forest Stacking

In [233]:
from feature_random_forest_stacking import RandomForestStacking

In [None]:
generate_stacking_feature(RandomForestStacking(), train_features, train_target, load_features('test'))

### XGB Stacking

In [235]:
from feature_xgb_stacking import XGBStacking

In [236]:
generate_stacking_feature(XGBStacking(), train_features, train_target, load_features('test'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  train_stacking_features[stacking.feature_name()] = stacking.fit_transform(train_X, train_y)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  test_stacking_features[stacking.feature_name()] = stacking.transform(test_X)


### Gradient Boosting Stacking

In [237]:
from feature_gradient_boosting_stacking import GradientBoostingStacking

In [238]:
generate_stacking_feature(GradientBoostingStacking(), train_features, train_target, load_features('test'))

### Extra Tree Stacking

In [242]:
from feature_extra_trees_stacking import ExtraTreesStacking

In [244]:
generate_stacking_feature(ExtraTreesStacking(), train_features, train_target, load_features('test'))

## Model

In [35]:
# def load_other_features():
#     features = ['counts', 'fuzzy_word', 'interrogative_forms', 'recall_precision_talmi']
    
#     feature_df = None
#     for feature in features:
#         for root, dirs, filenames in os.walk(os.path.join(config.FEATURE_DIR, feature)):
#             for filename in filenames:
#                 if filename.endswith('.csv'):
#                     print os.path.join(root, filename)
#                     df = pd.read_csv(os.path.join(root, filename))
#                     if feature_df is None:
#                         feature_df = df
#                     else:
#                         feature_df = feature_df.merge(df, on='id')
#     return feature_df

# train_other_features = load_other_features()

data/features/counts/part-00000-counts.csv
data/features/fuzzy_word/part-00000-b6cc8e1c-9d21-4868-adfc-22534a908bb2.csv
data/features/interrogative_forms/part-00000-066ad0c6-1dfb-481f-8b1b-57ca6190ba9c.csv
data/features/recall_precision_talmi/part-00000-cd79d36b-bd37-4436-9096-39d7c42c03a8.csv


In [63]:
def load_count_features():
    dfs = []
    for root, dirs, filenames in os.walk(os.path.join(config.FEATURE_DIR, 'counts')):
        for filename in filenames:
            if filename.endswith('.csv'):
                dfs.append(pd.read_csv(os.path.join(root, filename)))
    return pd.concat(dfs)

train_count_features = load_count_features()

In [245]:
train_features = load_features(feature_type='all')
train_target = load_target()
dev_train_features, dev_test_features, dev_train_target, dev_test_target = train_test_split(
    train_features, train_target, test_size=config.TEST_SIZE, random_state=config.RANDOM_SEED
)

In [248]:
rf = RandomForestClassifier(n_estimators=100, random_state=config.RANDOM_SEED)
rf.fit(dev_train_features.drop('id', 1), dev_train_target.drop('id', 1))

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=2017,
            verbose=0, warm_start=False)

In [249]:
log_loss_dup(dev_test_target.drop('id', 1), rf.predict_proba(dev_test_features.drop('id', 1)))

0.45057960035246419

In [250]:
xgb_params = {
    'max_depth': 9, 
    'learning_rate': 0.1,
    'n_estimators': 500, 
    'objective': 'binary:logistic',
    'nthread': 16, 
    'gamma': 0, 
    'subsample': 0.75, 
    'colsample_bytree': 0.75, 
    'colsample_bylevel': 1,
    'reg_alpha': 0, 
    'reg_lambda': 1, 
    'scale_pos_weight': 1
}
xgb_cls = xgb.XGBClassifier(**xgb_params)
xgb_cls.fit(dev_train_features.drop('id', 1), dev_train_target.drop('id', 1))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.75,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=9,
       min_child_weight=1, missing=None, n_estimators=500, nthread=16,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.75)

In [251]:
log_loss_dup(dev_test_target.drop('id', 1), xgb_cls.predict_proba(dev_test_features.drop('id', 1)))

0.44462600520222095

In [241]:
gbc = GradientBoostingClassifier(n_estimators=500, random_state=config.RANDOM_SEED)
gbc.fit(dev_train_features.drop('id', 1), dev_train_target.drop('id', 1))

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=200, presort='auto', random_state=2017,
              subsample=1.0, verbose=0, warm_start=False)

In [243]:
log_loss_dup(dev_test_target.drop('id', 1), gbc.predict_proba(dev_test_features.drop('id', 1)))

0.47078095238119649

In [108]:
abc = AdaBoostClassifier(n_estimators=300, learning_rate=0.0001, random_state=config.RANDOM_SEED)
abc.fit(dev_train_features.drop('id', 1), dev_train_target.drop('id', 1))

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.0001, n_estimators=300, random_state=2017)

In [109]:
log_loss_dup(dev_test_target.drop('id', 1), abc.predict_proba(dev_test_features.drop('id', 1)))

0.55022798901810421

In [127]:
etc = ExtraTreesClassifier(
    n_estimators=100,
    criterion='entropy',
    min_samples_leaf=2,
    random_state=config.RANDOM_SEED)
etc.fit(dev_train_features.drop('id', 1), dev_train_target.drop('id', 1))

  import sys


ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-06, min_samples_leaf=2,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=2017,
           verbose=0, warm_start=False)

In [128]:
log_loss_dup(dev_test_target.drop('id', 1), etc.predict_proba(dev_test_features.drop('id', 1)))

0.45021462510499716

## Submission

In [274]:
from utils.feature_utils import calibrate

In [257]:
train_features = load_features(feature_type='all')
train_target = load_target()
test_features = load_features(data_format='test', feature_type='all')

In [261]:
final_model = RandomForestClassifier(n_estimators=200)
final_model = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=100, random_state=config.RANDOM_SEED)),
        ('xgb', xgb.XGBClassifier(**xgb_params))
    ],
    voting='soft',
    weights=[0.5, 0.5])
final_model.fit(train_features.drop('id', 1), train_target.drop('id', 1))

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_...istic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.75))],
         n_jobs=1, voting='soft', weights=[0.5, 0.5])

In [263]:
log_loss_dup(train_target.drop('id', 1), final_model.predict_proba(train_features.drop('id', 1)))

0.22524900234832992

In [266]:
test_submission = test_features[['test_id']]

In [279]:
test_submission['is_duplicate'] = final_model.predict_proba(test_features.drop('test_id', 1))[:, 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [284]:
test_submission['is_duplicate'] = test_submission['is_duplicate'].apply(lambda r: calibrate(r))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [285]:
save_to_csv(test_submission, os.path.join(config.DATA_DIR, 'submission', 'submission.csv'), quoting=csv.QUOTE_MINIMAL)

## Output feature

In [100]:
save_to_csv(load_features('train'), os.path.join(config.DATA_DIR, 'train_vincent.csv', 'part-00000-vincent.csv'), quoting=csv.QUOTE_MINIMAL)
save_to_csv(load_features('test'), os.path.join(config.DATA_DIR, 'test_vincent.csv', 'part-00000-vincent.csv'), quoting=csv.QUOTE_MINIMAL)