In [1]:
import sklearn
import random
import itertools
import nltk
import re
import numpy as np
import pandas as pd
import time
import gensim
import lzma

from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.manifold import LocallyLinearEmbedding, SpectralEmbedding, TSNE
from sklearn.ensemble import RandomTreesEmbedding

from textstat.textstat import textstat
from gensim import corpora
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.stem import *
from nltk import word_tokenize, ngrams
from nltk.tag import AffixTagger
from keras.preprocessing.text import Tokenizer
from scipy.spatial import distance
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA, TruncatedSVD, LatentDirichletAllocation
from sklearn.preprocessing import Normalizer
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis

seed = 1337

Using TensorFlow backend.


In [2]:
def lowercase(df2):
    """Return a copy of ``df2`` whose object-dtype columns are lower-cased.

    The input frame is not modified; columns of any other dtype are carried
    over unchanged.
    """
    lowered = df2.copy()
    object_columns = lowered.select_dtypes(include=['object']).columns.values
    for column in object_columns:
        series = lowered[column]
        lowered[column] = series.str.lower()
    return lowered

def unidecode(df2):
    """Return a copy of ``df2`` with non-ASCII characters dropped from text.

    Every object-dtype column is encoded to ASCII with ``errors='ignore'``
    (characters that cannot be encoded are removed, not transliterated) and
    then decoded again, so the cells remain ``str`` values. Stopping after the
    encode step would leave ``bytes`` objects in the frame on Python 3.
    """
    df = df2.copy()
    text_feats = df.select_dtypes(include=['object']).columns.values
    for col in text_feats:
        ascii_bytes = df[col].str.encode('ascii', 'ignore')
        # Decode straight back so downstream string operations keep working.
        df[col] = ascii_bytes.str.decode('ascii')
    return df

def remove_nonalpha(df2):
    """Return a copy of ``df2`` with runs of non-word characters collapsed.

    In every object-dtype column each run matching the regular expression
    ``\\W+`` is replaced by a single space. ``\\W`` excludes letters, digits and
    the underscore, so those characters are kept.
    """
    df = df2.copy()
    text_feats = df.select_dtypes(include=['object']).columns.values
    for col in text_feats:
        # regex=True is passed explicitly: without it the result depends on the
        # pandas default, and a literal match of the two characters "\W+" would
        # leave the text unchanged.
        df[col] = df[col].str.replace(r'\W+', ' ', regex=True)
    return df

def repair_words(df2):
    """Return a copy of ``df2`` where character runs are capped at length two.

    For every object-dtype column, any run of the same character repeated
    three or more times is shortened to two occurrences (``'coooool'`` becomes
    ``'cool'``). Shorter runs are left alone.
    """
    def _squeeze(text):
        pieces = []
        for _, run in itertools.groupby(text):
            pieces.append(''.join(run)[:2])
        return ''.join(pieces)

    repaired = df2.copy()
    object_columns = repaired.select_dtypes(include=['object']).columns.values
    for column in object_columns:
        repaired[column] = repaired[column].apply(_squeeze)
    return repaired

def concat_words(df2):
    """Return a copy of ``df2`` with each object cell joined by single spaces.

    Each cell is iterated and its items are joined with ``' '``: a list of
    tokens becomes one space-separated string, while a plain string ends up
    with a space between every character.
    """
    def _join(items):
        return ' '.join(items)

    joined = df2.copy()
    object_columns = joined.select_dtypes(include=['object']).columns.values
    for column in object_columns:
        joined[column] = joined[column].apply(_join)
    return joined



def tokenize(df2):
    """Return a copy of ``df2`` with each object cell split by ``word_tokenize``.

    Every object-dtype column ends up holding the token list produced by
    NLTK's ``word_tokenize`` for the original cell.
    """
    def _to_tokens(text):
        return word_tokenize(text)

    tokenized = df2.copy()
    object_columns = tokenized.select_dtypes(include=['object']).columns.values
    for column in object_columns:
        tokenized[column] = tokenized[column].apply(_to_tokens)
    return tokenized

def ngram(df2, n):
    """Return a copy of ``df2`` with each object cell replaced by its word n-grams.

    Each cell is tokenised with ``word_tokenize`` and the result of
    ``ngrams(tokens, n)`` is materialised as a list.
    """
    def _word_ngrams(text):
        return list(ngrams(word_tokenize(text), n))

    grams = df2.copy()
    object_columns = grams.select_dtypes(include=['object']).columns.values
    for column in object_columns:
        grams[column] = grams[column].apply(_word_ngrams)
    return grams

def skipgram(df2, ngram_n, skip_n):
    """Return a copy of ``df2`` with a random, order-preserving sample of n-grams.

    Each object-dtype cell is tokenised, turned into word n-grams of size
    ``ngram_n``, and then reduced to at most ``skip_n`` randomly chosen
    n-grams kept in their original order. Sampling uses the module-level
    ``random`` state, so results are reproducible only if the caller seeds it.

    Cells with fewer than ``skip_n`` n-grams keep all of them; previously such
    cells made ``random.sample`` raise ``ValueError``.
    """
    def random_sample(words_list, skip_n):
        # random.sample refuses to draw more items than the population holds,
        # so cap the request at the number of available n-grams.
        take = min(skip_n, len(words_list))
        return [words_list[i] for i in sorted(random.sample(range(len(words_list)), take))]

    df = df2.copy()
    text_feats = df.select_dtypes(include=['object']).columns.values
    for col in text_feats:
        df[col] = df[col].apply(lambda x: list(ngrams(word_tokenize(x), ngram_n)))
        df[col] = df[col].apply(lambda x: random_sample(x, skip_n))
    return df

def chargram(df2, n):
    """Return a copy of ``df2`` with each object cell replaced by its character n-grams.

    Every cell becomes the list of all contiguous substrings of length ``n``,
    in order. Cells shorter than ``n`` yield an empty list. The requested
    ``n`` is honoured (it used to be ignored in favour of a fixed 3).
    """
    def chargram_generate(string, size):
        return [string[start:start + size] for start in range(len(string) - size + 1)]

    df = df2.copy()
    text_feats = df.select_dtypes(include=['object']).columns.values
    for col in text_feats:
        df[col] = df[col].apply(lambda text: chargram_generate(text, n))
    return df



def remove_stops(df2, stopwords):
    """Return a copy of ``df2`` with each object cell tokenised and stop words dropped.

    Parameters
    ----------
    df2 : DataFrame whose object-dtype columns hold text.
    stopwords : iterable of tokens to remove. Note that this parameter hides
        the ``nltk.corpus.stopwords`` name inside the function.

    Each object-dtype cell becomes the list of ``word_tokenize`` tokens that
    are not in ``stopwords``.
    """
    # Build the lookup once: membership is tested for every token of every row.
    stop_set = set(stopwords)
    df = df2.copy()
    text_feats = df.select_dtypes(include=['object']).columns.values
    for col in text_feats:
        df[col] = df[col].apply(lambda text: [tok for tok in word_tokenize(text) if tok not in stop_set])
    return df

def remove_extremes(df2, stopwords, min_count = 3, max_frequency = 0.75):
    """Return a copy of ``df2`` tokenised, without stop words or extreme-frequency tokens.

    Parameters
    ----------
    df2 : DataFrame whose object-dtype columns hold text.
    stopwords : iterable of tokens to remove before counting.
    min_count : passed to gensim's ``filter_extremes`` as ``no_below``.
    max_frequency : passed to gensim's ``filter_extremes`` as ``no_above``.

    A gensim ``Dictionary`` is built over the stop-word-free tokens of all
    object columns together and filtered with ``filter_extremes``. Each cell
    is then reduced to the tokens that survive that filter.

    The previous version kept the tokens that were *absent* from the filtered
    dictionary, i.e. it returned only the extremes and discarded everything
    else.
    """
    stop_set = set(stopwords)
    df = df2.copy()
    text_feats = df.select_dtypes(include=['object']).columns.values
    for col in text_feats:
        df[col] = df[col].apply(lambda text: [tok for tok in word_tokenize(text) if tok not in stop_set])

    tokenized = []
    for col in text_feats:
        tokenized += df[col].tolist()

    dictionary = corpora.Dictionary(tokenized)
    # keep_n=None disables gensim's additional "top-N most frequent" cut, so
    # only the two frequency thresholds decide which tokens survive.
    dictionary.filter_extremes(no_below = min_count, no_above = max_frequency, keep_n = None)
    # After filtering, token2id holds exactly the surviving tokens. A set gives
    # constant-time membership instead of rebuilding a list for every token.
    kept = set(dictionary.token2id)

    # The columns already hold stop-word-free token lists, so there is no need
    # to tokenise the raw text a second time.
    for col in text_feats:
        df[col] = df[col].apply(lambda tokens: [tok for tok in tokens if tok in kept])
    return df



def chop(df2, n):
    """Return a copy of ``df2`` with every token truncated to its first ``n`` characters.

    Each object-dtype cell is tokenised with ``word_tokenize`` and replaced by
    the list of token prefixes.
    """
    def _prefixes(text):
        return [token[:n] for token in word_tokenize(text)]

    chopped = df2.copy()
    object_columns = chopped.select_dtypes(include=['object']).columns.values
    for column in object_columns:
        chopped[column] = chopped[column].apply(_prefixes)
    return chopped

def stem(df2):
    """Return a copy of ``df2`` with every token of every object cell stemmed.

    Relies on the module-level ``stemmer`` object. Each cell is tokenised,
    each token is passed through ``stemmer.stem`` and the results are joined
    back into one space-separated string.
    """
    def _stem_text(text):
        stems = [stemmer.stem(token) for token in word_tokenize(text)]
        return ' '.join(stems)

    stemmed = df2.copy()
    object_columns = stemmed.select_dtypes(include=['object']).columns.values
    for column in object_columns:
        stemmed[column] = stemmed[column].apply(_stem_text)
    return stemmed

def lemmat(df2):
    """Return a copy of ``df2`` with every token of every object cell lemmatised.

    Relies on the module-level ``lemmatizer`` object. Each cell is tokenised,
    each token is passed through ``lemmatizer.lemmatize`` and the results are
    joined back into one space-separated string.
    """
    def _lemmatize_text(text):
        lemmas = [lemmatizer.lemmatize(token) for token in word_tokenize(text)]
        return ' '.join(lemmas)

    lemmatized = df2.copy()
    object_columns = lemmatized.select_dtypes(include=['object']).columns.values
    for column in object_columns:
        lemmatized[column] = lemmatized[column].apply(_lemmatize_text)
    return lemmatized

def extract_entity(df2):
    """Return a copy of ``df2`` with each object cell replaced by its POS-tag slices.

    Each cell is tokenised, tagged with ``nltk.pos_tag`` and every tagged item
    is sliced with ``[1:]``, so each element of the resulting list is what
    remains of the tagged item after dropping its first entry (the token).
    """
    def _tags_only(text):
        tagged = nltk.pos_tag(word_tokenize(text))
        return [pair[1:] for pair in tagged]

    tagged_frame = df2.copy()
    object_columns = tagged_frame.select_dtypes(include=['object']).columns.values
    for column in object_columns:
        tagged_frame[column] = tagged_frame[column].apply(_tags_only)
    return tagged_frame



def doc_features(df2):
    """Return a copy of ``df2`` extended with simple per-text count features.

    For the ``i``-th object-dtype column (0-based, in column order) these
    columns are appended:

    * ``num_characters_i`` - length of ``str(cell)``
    * ``num_words_i``      - number of whitespace-separated pieces of ``str(cell)``
    * ``num_spaces_i``     - occurrences of ``' '`` in the cell
    * ``num_alpha_i``      - items of the cell for which ``isalpha()`` is true
    * ``num_nonalpha_i``   - items of the cell for which ``isalpha()`` is false
    """
    def _alpha_count(text):
        return sum(ch.isalpha() for ch in text)

    def _nonalpha_count(text):
        return sum(1 for ch in text if not ch.isalpha())

    enriched = df2.copy()
    # Resolved before any feature column is added, so only the original text
    # columns are measured.
    object_columns = enriched.select_dtypes(include=['object']).columns.values
    for position, name in enumerate(object_columns):
        enriched['num_characters_{}'.format(position)] = enriched[name].map(lambda text: len(str(text)))
        enriched['num_words_{}'.format(position)] = enriched[name].map(lambda text: len(str(text).split()))
        enriched['num_spaces_{}'.format(position)] = enriched[name].map(lambda text: text.count(' '))
        enriched['num_alpha_{}'.format(position)] = enriched[name].apply(_alpha_count)
        enriched['num_nonalpha_{}'.format(position)] = enriched[name].apply(_nonalpha_count)
    return enriched

def get_readability(df2):
    """Return a copy of ``df2`` extended with ``textstat`` scores.

    For the ``i``-th object-dtype column (0-based, in column order) one column
    per metric is appended, named ``<metric><i>`` where ``<metric>`` is the
    name of the ``textstat`` method that produced it.
    """
    # Column prefix and textstat method name coincide for every metric; the
    # order fixes the order of the added columns.
    metric_names = (
        'flesch_reading_ease',
        'smog_index',
        'flesch_kincaid_grade',
        'coleman_liau_index',
        'automated_readability_index',
        'dale_chall_readability_score',
        'difficult_words',
        'linsear_write_formula',
        'gunning_fog',
        'text_standard',
    )

    def _score(series, metric):
        return series.apply(lambda text: getattr(textstat, metric)(text))

    scored = df2.copy()
    object_columns = scored.select_dtypes(include=['object']).columns.values
    for position, name in enumerate(object_columns):
        for metric in metric_names:
            scored[metric + '{}'.format(position)] = _score(scored[name], metric)
    return scored

def bag_of_words(df2):
    """Return the bag-of-words matrix of each row's two questions combined.

    ``df2`` must provide ``question1`` and ``question2`` columns. The two
    texts of a row are joined into one document and vectorised with a
    ``CountVectorizer`` (English stop words, ``min_df=1``, ``max_df=0.999``).
    The result of ``fit_transform`` is returned.
    """
    cv = CountVectorizer(stop_words = 'english', min_df = 1, max_df = 0.999)
    # Join with a space: plain concatenation glues the last word of question1
    # to the first word of question2 and creates a bogus token.
    paired_text = df2.question1 + ' ' + df2.question2
    return cv.fit_transform(paired_text)

def tf_idf(df2):
    """Return the TF-IDF matrix of each row's two questions combined.

    ``df2`` must provide ``question1`` and ``question2`` columns. The two
    texts of a row are joined into one document and vectorised with a
    ``TfidfVectorizer`` (English stop words, ``min_df=1``, ``max_df=0.999``).
    The result of ``fit_transform`` is returned.
    """
    tf = TfidfVectorizer(stop_words = 'english', min_df = 1, max_df = 0.999)
    # Join with a space: plain concatenation glues the last word of question1
    # to the first word of question2 and creates a bogus token.
    paired_text = df2.question1 + ' ' + df2.question2
    return tf.fit_transform(paired_text)

def LDA_text2(df2, ntopics):
    """Fit LDA on the combined questions and transform every text column.

    A ``CountVectorizer`` and a ``LatentDirichletAllocation`` model with
    ``ntopics`` components (seeded with the module-level ``seed``) are fitted
    on the row-wise combination of ``question1`` and ``question2``.

    Returns a list with one entry per object-dtype column of ``df2``, in
    column order: the fitted LDA's ``transform`` of that column's
    bag-of-words matrix.
    """
    cv = CountVectorizer(stop_words = 'english', min_df = 2, max_df = 0.99)
    lda = LatentDirichletAllocation(ntopics, random_state = seed, n_jobs = 1)
    text_feats = df2.select_dtypes(include=['object']).columns.values
    # Join with a space so the last word of question1 and the first word of
    # question2 are not fused into one bogus vocabulary entry.
    paired_text = df2.question1 + ' ' + df2.question2
    bow = cv.fit_transform(paired_text)
    lda.fit(bow)
    return [lda.transform(cv.transform(df2[col])) for col in text_feats]

def SVD_text(df2, ndims):
    """Fit a truncated SVD on the combined questions and project every text column.

    A ``CountVectorizer`` and a ``TruncatedSVD`` with ``ndims`` components
    (seeded with the module-level ``seed``) are fitted on the row-wise
    combination of ``question1`` and ``question2``.

    Returns a list with one entry per object-dtype column of ``df2``, in
    column order: the SVD projection of that column's bag-of-words matrix.
    """
    cv = CountVectorizer(stop_words = 'english', min_df = 2, max_df = 0.99)
    svd = TruncatedSVD(ndims, random_state = seed)
    text_feats = df2.select_dtypes(include=['object']).columns.values
    # Join with a space so the last word of question1 and the first word of
    # question2 are not fused into one bogus vocabulary entry.
    paired_text = df2.question1 + ' ' + df2.question2
    bow = cv.fit_transform(paired_text)
    svd.fit(bow)
    return [svd.transform(cv.transform(df2[col])) for col in text_feats]

def LSA_text(df2, ndims):
    """Project every text column with SVD and row-normalise the result.

    Same pipeline as a bag-of-words truncated SVD with ``ndims`` components
    fitted on the row-wise combination of ``question1`` and ``question2``,
    followed by a ``Normalizer`` applied to each projection.

    Returns a list with one entry per object-dtype column of ``df2``, in
    column order.
    """
    cv = CountVectorizer(stop_words = 'english', min_df = 2, max_df = 0.99)
    svd = TruncatedSVD(ndims, random_state = 1337)
    normalizer = Normalizer(copy = False)
    text_feats = df2.select_dtypes(include=['object']).columns.values
    # Join with a space so the last word of question1 and the first word of
    # question2 are not fused into one bogus vocabulary entry.
    paired_text = df2.question1 + ' ' + df2.question2
    bow = cv.fit_transform(paired_text)
    # fit_transform fits the SVD and yields the projection in a single pass;
    # the projection is only needed to fit the normalizer.
    normalizer.fit(svd.fit_transform(bow))
    svds = []
    for col in text_feats:
        projected = svd.transform(cv.transform(df2[col]))
        svds.append(normalizer.transform(projected))
    return svds

In [None]:
# Module-level NLP helpers. `stemmer` and `lemmatizer` are read as globals by
# stem() and lemmat() above.
stemmer = snowball.SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()
# English stop-word list from the NLTK corpus.
stopwords_eng = stopwords.words('english')
# Case-insensitive pattern matching runs of word characters.
words = re.compile(r"\w+",re.I)

# Word2vec vectors loaded from a binary file at a machine-specific absolute path.
model = KeyedVectors.load_word2vec_format('/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/embeddings/GoogleNews-vectors-negative300.bin',                            
                                             binary=True)

In [3]:
# NOTE(review): `src` is assigned here but the two read_csv calls below use bare
# file names, so they resolve against the current working directory.
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'

dftr2 = pd.read_csv('df_train_lemmatfullcleanSTEMMED.csv')
dfte2 = pd.read_csv('df_test_lemmatfullcleanSTEMMED.csv')

# Train rows first, then test rows; split_traintest() cuts at dftr2.shape[0].
df_full = pd.concat((dftr2, dfte2))

In [None]:
# `seq_matcher` is not imported anywhere else in this notebook; the call shape
# seq_matcher(None, a, b).ratio() is that of difflib.SequenceMatcher.
from difflib import SequenceMatcher as seq_matcher


def _edit_distance(row):
    """Return 1 minus the SequenceMatcher similarity ratio of the row's two questions."""
    return 1 - seq_matcher(None, row['question1'], row['question2']).ratio()


src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'
# Train frames drop their last column, test frames their first four columns.
trdf =  pd.read_csv(src + 'df_train_spacylemmat_fullclean.csv').iloc[:, :-1]
tedf =  pd.read_csv(src + 'df_test_spacylemmat_fullclean.csv').iloc[:, 4:]
# Missing questions become the literal text 'NULL' so they can be compared as strings.
trdf.fillna('NULL', inplace = True)
tedf.fillna('NULL', inplace = True)

trdfs =  pd.read_csv(src + 'df_train_lemmatfullcleanSTEMMED.csv').iloc[:, :-1]
tedfs =  pd.read_csv(src + 'df_test_lemmatfullcleanSTEMMED.csv').iloc[:, 4:]
trdfs.fillna('NULL', inplace = True)
tedfs.fillna('NULL', inplace = True)
print('Data loaded.')


Xtr = pd.DataFrame()
Xtr['EDITdistance_fullclean'] = trdf.apply(_edit_distance, axis = 1)
Xtr['EDITdistance_fullcleanSTEM'] = trdfs.apply(_edit_distance, axis = 1)
print('Training set done.')

Xte = pd.DataFrame()
Xte['EDITdistance_fullclean'] = tedf.apply(_edit_distance, axis = 1)
Xte['EDITdistance_fullcleanSTEM'] = tedfs.apply(_edit_distance, axis = 1)
print('Test set done.')

Xtr.to_csv('train_EDITdistance.csv', index = False)
Xte.to_csv('test_EDITdistance.csv', index = False)

In [13]:
def compression_distance(x,y,l_x=None,l_y=None):
    """Return an LZMA-based compression distance between two strings.

    Parameters
    ----------
    x, y : str
        Texts to compare; they are UTF-8 encoded before compression.
    l_x, l_y : int, optional
        Pre-computed compressed lengths of ``x`` and ``y``. Each one is
        computed here when it is not supplied, independently of the other.

    Returns
    -------
    float
        ``(min(C(xy), C(yx)) - min(C(x), C(y))) / max(C(x), C(y))`` where
        ``C`` is the LZMA-compressed length, or ``0.0`` when ``x == y``.
    """
    if x==y:
        return 0.0
    x_b = x.encode('utf-8')
    y_b = y.encode('utf-8')
    # Handle the two cached lengths separately: supplying only one of them used
    # to either crash (l_y left as None) or be ignored (l_y overwritten).
    if l_x is None:
        l_x = len(lzma.compress(x_b))
    if l_y is None:
        l_y = len(lzma.compress(y_b))
    l_xy = len(lzma.compress(x_b+y_b))
    l_yx = len(lzma.compress(y_b+x_b))
    dist = (min(l_xy,l_yx)-min(l_x,l_y))/max(l_x,l_y)
    return dist

# Compression distance for every training pair, written to CSV.
X = pd.DataFrame()
X['compression_distance'] = dftr2.apply(lambda x: compression_distance(x['question1'], x['question2']),
                                                axis = 1)               
X.to_csv('train_LZMAcompression_distance.csv', index = False)
# Bare expression: only displayed when it is the last statement of a notebook cell.
X

# Same feature for the test pairs; `X` is rebound, the train frame is already on disk.
X = pd.DataFrame()
X['compression_distance'] = dfte2.apply(lambda x: compression_distance(x['question1'], x['question2']),
                                                axis = 1)               
X.to_csv('test_LZMAcompression_distance.csv', index = False)
X

Unnamed: 0,compression_distance
0,0.310345
1,0.322581
2,0.366667
3,0.300000
4,0.285714
5,0.333333
6,0.222222
7,0.424242
8,0.285714
9,0.472222


In [3]:
def split_traintest(l):
    """Split each sliceable item of ``l`` at the number of training rows.

    The cut point is ``dftr2.shape[0]`` (module-level training frame).

    Returns a pair ``(train, test)`` of lists aligned with ``l``: ``train[k]``
    is the leading part of ``l[k]`` and ``test[k]`` is the remainder.
    """
    train, test = [], []
    for matrix in l:
        boundary = dftr2.shape[0]
        head, tail = matrix[:boundary], matrix[boundary:]
        train.append(head)
        test.append(tail)
    return train, test

def norm_wmd(s1, s2):
    """Return the word mover's distance between two texts using ``norm_model``.

    Both inputs are converted with ``str``, lower-cased and split on
    whitespace before being passed to ``norm_model.wmdistance``
    (``norm_model`` is a module-level object). No stop-word filtering is
    applied.
    """
    # The stop-word list that used to be loaded here was never used, yet it
    # was fetched again for every pair of questions.
    s1 = str(s1).lower().split()
    s2 = str(s2).lower().split()
    return norm_model.wmdistance(s1, s2)

def wmd(s1, s2):
    """Return the word mover's distance between two texts using ``model``.

    Both inputs are converted with ``str``, lower-cased and split on
    whitespace before being passed to ``model.wmdistance`` (``model`` is a
    module-level object).
    """
    first_tokens, second_tokens = (str(text).lower().split() for text in (s1, s2))
    return model.wmdistance(first_tokens, second_tokens)

def extract_WMD(train = True):
    """Compute the four WMD feature columns and write them to a CSV file.

    With ``train`` true the module-level frames ``trdf`` / ``trdfs`` are used
    and the result goes to ``train_WMD_cleaned_stemmed.csv``; otherwise
    ``tedf`` / ``tedfs`` and ``test_WMD_cleaned_stemmed.csv``. Progress and the
    time elapsed since the module-level timestamp ``t`` are printed. Returns
    ``None``.
    """
    if train:
        print('Extracting WMD train distances.')
        clean_frame, stemmed_frame = trdf, trdfs
        target = 'train_WMD_cleaned_stemmed.csv'
    else:
        print('Extracting WMD test distances.')
        clean_frame, stemmed_frame = tedf, tedfs
        target = 'test_WMD_cleaned_stemmed.csv'

    # (column name, source frame, distance function), in output column order.
    plan = (
        ('wmd_clean', clean_frame, wmd),
        ('norm_wmd_clean', clean_frame, norm_wmd),
        ('wmd_cleanStemmed', stemmed_frame, wmd),
        ('norm_wmd_cleanStemmed', stemmed_frame, norm_wmd),
    )
    feats = pd.DataFrame()
    for column, frame, metric in plan:
        feats[column] = frame.apply(
            lambda row, fn=metric: fn(row['question1'], row['question2']), axis=1)

    feats.to_csv(target, index = False)
    print('WMD distances extracted.')
    print(time.time() - t)
    return

def get_distances(transformation_name, question1_vectors, question2_vectors):
    """Return row-wise distances between two aligned collections of vectors.

    Parameters
    ----------
    transformation_name : label appended to every column name.
    question1_vectors, question2_vectors : aligned sequences of vectors; NaN
        entries are replaced with ``np.nan_to_num`` before any distance is
        computed.

    Returns
    -------
    DataFrame with one row per vector pair and the columns
    ``<metric>_distance_<transformation_name>`` for cosine, cityblock,
    jaccard, canberra, euclidean, minkowski (order 3) and braycurtis, in that
    order.
    """
    # Clean the inputs once; every metric works on the same cleaned arrays.
    q1 = np.nan_to_num(question1_vectors)
    q2 = np.nan_to_num(question2_vectors)

    # (column prefix, distance function), in output column order.
    metrics = (
        ('cosine', cosine),
        ('cityblock', cityblock),
        ('jaccard', jaccard),
        ('canberra', canberra),
        ('euclidean', euclidean),
        ('minkowski', lambda x, y: minkowski(x, y, 3)),
        ('braycurtis', braycurtis),
    )

    data = pd.DataFrame()
    for prefix, metric in metrics:
        column = '{}_distance_{}'.format(prefix, transformation_name)
        data[column] = [metric(x, y) for (x, y) in zip(q1, q2)]
    return data

def get_moments(transformation_name, question1_vectors, question2_vectors):
    """Return skewness and kurtosis of every vector of both questions.

    Parameters
    ----------
    transformation_name : label appended to every column name.
    question1_vectors, question2_vectors : sequences of vectors; NaN entries
        are replaced with ``np.nan_to_num`` first.

    Returns
    -------
    DataFrame with the columns ``skew_q1_<name>``, ``skew_q2_<name>``,
    ``kur_q1_<name>`` and ``kur_q2_<name>``. The two questions get separate
    columns; previously both wrote to the same ``skew_<name>`` / ``kur_<name>``
    key, so the question1 values were overwritten by the question2 values.
    """
    # Imported locally: neither name is imported at the top of the file.
    from scipy.stats import kurtosis, skew

    q1 = np.nan_to_num(question1_vectors)
    q2 = np.nan_to_num(question2_vectors)

    data = pd.DataFrame()
    data['skew_q1_{}'.format(transformation_name)] = [skew(x) for x in q1]
    data['skew_q2_{}'.format(transformation_name)] = [skew(x) for x in q2]
    data['kur_q1_{}'.format(transformation_name)] = [kurtosis(x) for x in q1]
    data['kur_q2_{}'.format(transformation_name)] = [kurtosis(x) for x in q2]
    return data

Data loaded.
Models loaded.
Extracting WMD test distances.
WMD distances extracted.
12495.259793043137


In [None]:
# Start timestamp; extract_WMD() prints the time elapsed since this value.
t = time.time()

src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'
# Train frames drop their last column, test frames their first four columns.
trdf =  pd.read_csv(src + 'df_train_spacylemmat_fullclean.csv').iloc[:, :-1]
tedf =  pd.read_csv(src + 'df_test_spacylemmat_fullclean.csv').iloc[:, 4:]
# Missing questions become the literal text 'NULL'.
trdf.fillna('NULL', inplace = True)
tedf.fillna('NULL', inplace = True)

trdfs =  pd.read_csv(src + 'df_train_lemmatfullcleanSTEMMED.csv').iloc[:, :-1]
tedfs =  pd.read_csv(src + 'df_test_lemmatfullcleanSTEMMED.csv').iloc[:, 4:]
trdfs.fillna('NULL', inplace = True)
tedfs.fillna('NULL', inplace = True)
print('Data loaded.')

src2 = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/embeddings/'
# The same vector file is loaded twice: `model` stays as loaded, while
# `norm_model` is modified by init_sims(replace=True) below.
model = gensim.models.KeyedVectors.load_word2vec_format(src2 + 'GoogleNews-vectors-negative300.bin', binary=True)
norm_model = gensim.models.KeyedVectors.load_word2vec_format(src2 + 'GoogleNews-vectors-negative300.bin', binary=True)
norm_model.init_sims(replace=True)
print('Models loaded.')

# Only the test-set features are produced by this run.
extract_WMD(train = False)

In [None]:
def SVD_text_tfidf(df2, ndims, gram_range, analyze = 'word'):
    """Fit TF-IDF + truncated SVD on the combined questions and project every text column.

    Parameters
    ----------
    df2 : DataFrame with ``question1`` and ``question2`` columns.
    ndims : number of SVD components.
    gram_range : upper bound of the n-gram range ``(1, gram_range)``.
    analyze : forwarded to ``TfidfVectorizer`` as ``analyzer``.

    Returns a list with one entry per object-dtype column of ``df2``, in
    column order: the SVD projection of that column's TF-IDF matrix.
    """
    tf = TfidfVectorizer(stop_words = 'english', min_df = 1, max_df = 0.999, ngram_range = (1, gram_range),
                        analyzer = analyze)
    svd = TruncatedSVD(ndims, random_state = seed)
    text_feats = df2.select_dtypes(include=['object']).columns.values
    # Join with a space so the last word of question1 and the first word of
    # question2 are not fused into one bogus vocabulary entry.
    paired_text = df2.question1 + ' ' + df2.question2
    bow = tf.fit_transform(paired_text)
    svd.fit(bow)
    return [svd.transform(tf.transform(df2[col])) for col in text_feats]

def LSA_text_tfidf(df2, ndims, gram_range, analyze = 'word'):
    """Project every text column with TF-IDF + SVD and row-normalise the result.

    Parameters
    ----------
    df2 : DataFrame with ``question1`` and ``question2`` columns.
    ndims : number of SVD components.
    gram_range : upper bound of the n-gram range ``(1, gram_range)``.
    analyze : forwarded to ``TfidfVectorizer`` as ``analyzer``.

    Returns a list with one entry per object-dtype column of ``df2``, in
    column order: the normalised SVD projection of that column's TF-IDF
    matrix.
    """
    tf = TfidfVectorizer(stop_words = 'english', min_df = 1, max_df = 0.999, ngram_range = (1, gram_range),
                        analyzer = analyze)
    svd = TruncatedSVD(ndims, random_state = 1337)
    normalizer = Normalizer(copy = False)
    text_feats = df2.select_dtypes(include=['object']).columns.values
    # Join with a space so the last word of question1 and the first word of
    # question2 are not fused into one bogus vocabulary entry.
    paired_text = df2.question1 + ' ' + df2.question2
    bow = tf.fit_transform(paired_text)
    # fit_transform fits the SVD and yields the projection in a single pass;
    # the projection is only needed to fit the normalizer.
    normalizer.fit(svd.fit_transform(bow))
    svds = []
    for col in text_feats:
        projected = svd.transform(tf.transform(df2[col]))
        svds.append(normalizer.transform(projected))
    return svds

def run_transforms(transformation_name, ndims, gram_range, analyze, test = False):
    """Build LSA and SVD distance features from ``df_full`` and write them to CSV.

    Parameters
    ----------
    transformation_name : label used in column names and output file names.
    ndims, gram_range, analyze : forwarded to ``LSA_text_tfidf`` and
        ``SVD_text_tfidf``.
    test : when true, the test-set features are written as well.

    Files written: ``train_LSA_<name>.csv`` and ``train_SVD_<name>.csv``, plus
    ``test_LSA_<name>.csv`` and ``test_SVD_<name>.csv`` when ``test`` is true.
    The SVD files now carry the ``.csv`` extension like the LSA files; they
    used to be written without any extension. Returns ``None``.
    """
    lsa_dff = LSA_text_tfidf(df_full, ndims, gram_range, analyze)
    svd_dff = SVD_text_tfidf(df_full, ndims, gram_range, analyze)
    lsa_tr, lsa_te = split_traintest(lsa_dff)
    svd_tr, svd_te = split_traintest(svd_dff)

    # Entries 0 and 1 of each list are taken to be the two question columns.
    tr_lsa_dist = get_distances('train_LSA_{}'.format(transformation_name), lsa_tr[0], lsa_tr[1])
    tr_svd_dist = get_distances('train_SVD_{}'.format(transformation_name), svd_tr[0], svd_tr[1])
    tr_lsa_dist.to_csv('train_LSA_{}.csv'.format(transformation_name), index = False)
    tr_svd_dist.to_csv('train_SVD_{}.csv'.format(transformation_name), index = False)
    if test:
        te_lsa_dist = get_distances('test_LSA_{}'.format(transformation_name), lsa_te[0], lsa_te[1])
        te_svd_dist = get_distances('test_SVD_{}'.format(transformation_name), svd_te[0], svd_te[1])
        te_lsa_dist.to_csv('test_LSA_{}.csv'.format(transformation_name), index = False)
        te_svd_dist.to_csv('test_SVD_{}.csv'.format(transformation_name), index = False)
    return

# Word-level TF-IDF with n-gram ranges (1, 3) and (1, 5), 50 SVD dimensions each;
# train and test feature files are written for both.
run_transforms('TFIDF_3grams_words_50dim', 50, 3, 'word', test = True)
run_transforms('TFIDF_5grams_words_50dim', 50, 5, 'word', test = True)