In [1]:
%load_ext klab-autotime

In [2]:
import argparse
import functools
from collections import defaultdict

import numpy as np
import pandas as df

from nltk.corpus import stopwords
from collections import Counter
from sklearn.metrics import log_loss
from tqdm import trange
from tqdm import tqdm, tqdm_notebook
tqdm.pandas()
def word_match_share(row, stops=None):
    """Share of non-stop words that the query and the title have in common.

    Parameters
    ----------
    row : mapping-like with 'qurey' and 'title' entries, each an iterable of words.
    stops : container of words to ignore; ``None`` means "ignore nothing".

    Returns
    -------
    ``2 * |shared| / (|query words| + |title words|)`` computed on the distinct
    non-stop words, or 0 when either side has no non-stop word.
    """
    # A None default used to reach `word not in None` and raise TypeError.
    stops = () if stops is None else stops
    query_words = {word for word in row['qurey'] if word not in stops}
    title_words = {word for word in row['title'] if word not in stops}
    if not query_words or not title_words:
        # Some rows consist of nothing but stop words (or are empty).
        return 0
    shared = query_words & title_words
    # Each shared word is counted once per side, hence the factor 2.
    return 2 * len(shared) / (len(query_words) + len(title_words))

def jaccard(row):
    """Jaccard similarity between the distinct query words and title words.

    Returns ``|intersection| / |union|``; when both sides are empty the
    denominator is taken as 1, so the result is 0.
    """
    query_vocab = set(row['qurey'])
    title_vocab = set(row['title'])
    overlap_size = len(query_vocab & title_vocab)
    union_size = len(query_vocab | title_vocab)
    # Guard against dividing by zero for two empty inputs.
    denominator = union_size if union_size != 0 else 1
    return overlap_size / denominator

def common_words(row):
    """Number of distinct words that occur in both the query and the title."""
    query_vocab = set(row['qurey'])
    title_vocab = set(row['title'])
    return len(query_vocab & title_vocab)
    
def len_query(row):
    """Length of the row's 'qurey' entry (its number of words)."""
    query_words = row['qurey']
    return len(query_words)

def len_title(row):
    """Length of the row's 'title' entry (its number of words)."""
    title_words = row['title']
    return len(title_words)

def total_unique_words(row):
    """Number of distinct words across the query and the title together."""
    vocabulary = set(row['qurey'])
    vocabulary.update(row['title'])
    return len(vocabulary)

def total_unq_words_stop(row, stops):
    """Number of distinct query/title words that are not in ``stops``."""
    vocabulary = set(row['qurey']).union(row['title'])
    return sum(1 for word in vocabulary if word not in stops)

def wc_diff(row):
    """Absolute difference between the query and title word counts."""
    query_count = len(row['qurey'])
    title_count = len(row['title'])
    return abs(query_count - title_count)

def wc_ratio(row):
    """Ratio of the smaller to the larger word count of query and title.

    Returns NaN when the title is empty and 0 when only the query is empty;
    otherwise a value in (0, 1].
    """
    l1 = len(row['qurey'])
    l2 = len(row['title'])
    if l2 == 0:
        return np.nan
    # Compare the counts explicitly. Testing the truthiness of `l1 / l2`
    # selected `l2 / l1` for every non-empty query, so the ratio was not
    # symmetric and could exceed 1.
    if l1 > l2:
        return l2 / l1
    return l1 / l2

def wc_diff_unique(row):
    """Absolute difference between the distinct-word counts of query and title."""
    distinct_query = len(set(row['qurey']))
    distinct_title = len(set(row['title']))
    return abs(distinct_query - distinct_title)

def wc_ratio_unique(row):
    """Ratio of the smaller to the larger distinct-word count of query and title.

    Returns NaN when the title has no words and 0 when only the query is
    empty; otherwise a value in (0, 1].
    """
    l1 = len(set(row['qurey']))
    l2 = len(set(row['title']))
    if l2 == 0:
        return np.nan
    # Compare the counts explicitly. Testing the truthiness of `l1 / l2`
    # selected `l2 / l1` for every non-empty query.
    if l1 > l2:
        return l2 / l1
    return l1 / l2

def wc_diff_unique_stop(row, stops=None):
    """Absolute difference between the distinct non-stop word counts.

    ``stops`` is a container of words to ignore; ``None`` means "ignore nothing".
    """
    # A None default used to reach `x not in None` and raise TypeError.
    stops = () if stops is None else stops
    query_kept = sum(1 for word in set(row['qurey']) if word not in stops)
    title_kept = sum(1 for word in set(row['title']) if word not in stops)
    return abs(query_kept - title_kept)

def wc_ratio_unique_stop(row, stops=None):
    """Ratio of the smaller to the larger distinct non-stop word count.

    ``stops`` is a container of words to ignore; ``None`` means "ignore nothing".
    Returns NaN when the title has no non-stop word and 0 when only the query
    has none; otherwise a value in (0, 1].
    """
    # A None default used to reach `x not in None` and raise TypeError.
    stops = () if stops is None else stops
    l1 = sum(1 for word in set(row['qurey']) if word not in stops)
    l2 = sum(1 for word in set(row['title']) if word not in stops)
    if l2 == 0:
        return np.nan
    # Compare the counts explicitly. Testing the truthiness of `l1 / l2`
    # selected `l2 / l1` for every non-empty query.
    if l1 > l2:
        return l2 / l1
    return l1 / l2

def same_start_word(row):
    """1 if query and title begin with the same word, 0 if not, NaN if either is empty."""
    if row['qurey'] and row['title']:
        first_query_word = row['qurey'][0]
        first_title_word = row['title'][0]
        return int(first_query_word == first_title_word)
    # At least one side has no first word to compare.
    return np.nan

def char_diff(row):
    """Absolute difference in total character count (words concatenated, no separators)."""
    query_text = ''.join(row['qurey'])
    title_text = ''.join(row['title'])
    return abs(len(query_text) - len(title_text))

def char_ratio(row):
    """Ratio of the smaller to the larger character count of query and title.

    Character counts are taken over the words concatenated without separators.
    Returns NaN when the title has no characters and 0 when only the query has
    none; otherwise a value in (0, 1].
    """
    l1 = len(''.join(row['qurey']))
    l2 = len(''.join(row['title']))
    if l2 == 0:
        return np.nan
    # Compare the counts explicitly. Testing the truthiness of `l1 / l2`
    # selected `l2 / l1` for every non-empty query.
    if l1 > l2:
        return l2 / l1
    return l1 / l2

def char_diff_unique_stop(row, stops=None):
    """Absolute difference in character count over the distinct non-stop words.

    ``stops`` is a container of words to ignore; ``None`` means "ignore nothing".
    """
    # A None default used to reach `x not in None` and raise TypeError.
    stops = () if stops is None else stops
    query_chars = len(''.join(word for word in set(row['qurey']) if word not in stops))
    title_chars = len(''.join(word for word in set(row['title']) if word not in stops))
    return abs(query_chars - title_chars)


def get_weight(count, eps=10000, min_count=2):
    """Weight for a word seen ``count`` times.

    Words seen fewer than ``min_count`` times get weight 0; every other word
    gets ``1 / (count + eps)``, so more frequent words weigh less.
    """
    return 0 if count < min_count else 1 / (count + eps)
    
def tfidf_word_match_share_stops(row, stops=None, weights=None):
    """Weighted share of non-stop words common to the query and the title.

    Parameters
    ----------
    row : mapping-like with 'qurey' and 'title' entries, each an iterable of words.
    stops : container of words to ignore; ``None`` means "ignore nothing".
    weights : mapping word -> weight, looked up with ``.get(word, 0)``. It must
        be supplied; the ``None`` default has no ``.get``.

    Returns
    -------
    Sum of the weights of the shared words (counted once per side) divided by
    the sum of the weights of all distinct non-stop words on both sides, or 0
    when either side has no non-stop word. A zero total weight yields NaN from
    the NumPy division.
    """
    # A None default used to reach `word not in None` and raise TypeError.
    stops = () if stops is None else stops
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # weights are summed in a stable order.
    q1words = dict.fromkeys((word for word in row['qurey'] if word not in stops), 1)
    q2words = dict.fromkeys((word for word in row['title'] if word not in stops), 1)
    if len(q1words) == 0 or len(q2words) == 0:
        # Some rows consist of nothing but stop words (or are empty).
        return 0

    shared_weights = (
        [weights.get(w, 0) for w in q1words if w in q2words]
        + [weights.get(w, 0) for w in q2words if w in q1words]
    )
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]

    return np.sum(shared_weights) / np.sum(total_weights)

def tfidf_word_match_share(row, weights=None):
    """Weighted share of words common to the query and the title.

    ``weights`` maps word -> weight and is looked up with ``.get(word, 0)``.
    Returns the summed weights of the shared words (counted once per side)
    over the summed weights of all distinct words on both sides, or 0 when
    either side is empty.
    """
    # De-duplicate each side while keeping first-seen order.
    query_vocab = dict.fromkeys(row['qurey'], 1)
    title_vocab = dict.fromkeys(row['title'], 1)
    if not query_vocab or not title_vocab:
        # Nothing to compare on at least one side.
        return 0

    def weight_of(word):
        return weights.get(word, 0)

    shared_from_query = [weight_of(w) for w in query_vocab if w in title_vocab]
    shared_from_title = [weight_of(w) for w in title_vocab if w in query_vocab]
    all_query = [weight_of(w) for w in query_vocab]
    all_title = [weight_of(w) for w in title_vocab]

    numerator = np.sum(shared_from_query + shared_from_title)
    denominator = np.sum(all_query + all_title)
    return numerator / denominator


def build_features(data, stops, weights):
    """Compute the query/title pair features for every row of ``data``.

    Parameters
    ----------
    data : DataFrame whose 'qurey' and 'title' columns hold the word sequences
        read by the row functions.
    stops : container of stop words, forwarded to ``word_match_share``.
    weights : mapping word -> weight, forwarded to ``tfidf_word_match_share``.

    Returns
    -------
    DataFrame with one column per enabled feature, in the order listed below.

    Notes
    -----
    Uses ``DataFrame.progress_apply``, so ``tqdm.pandas()`` must have been
    called beforehand.
    """
    # (column name, row function) pairs in output-column order. Commented-out
    # entries are features that are currently disabled.
    feature_funcs = [
        # ('len_query', len_query),
        # ('len_title', len_title),
        # ('common_words', common_words),  #14
        ('word_match', functools.partial(word_match_share, stops=stops)),  #1
        ('tfidf_wm', functools.partial(tfidf_word_match_share, weights=weights)),  #2
        # ('tfidf_wm_stops', functools.partial(tfidf_word_match_share_stops, stops=stops, weights=weights)),  #3
        ('jaccard', jaccard),  #4
        ('wc_diff', wc_diff),  #5
        ('wc_ratio', wc_ratio),  #6
        ('wc_diff_unique', wc_diff_unique),  #7
        ('wc_ratio_unique', wc_ratio_unique),  #8
        # ('wc_diff_unq_stop', functools.partial(wc_diff_unique_stop, stops=stops)),  #9
        # ('wc_ratio_unique_stop', functools.partial(wc_ratio_unique_stop, stops=stops)),  #10
        ('same_start', same_start_word),  #11
        ('char_diff', char_diff),  #12
        # ('char_diff_unq_stop', functools.partial(char_diff_unique_stop, stops=stops)),  #13
        ('total_unique_words', total_unique_words),  #15
        # ('total_unq_words_stop', functools.partial(total_unq_words_stop, stops=stops)),  #16
        ('char_ratio', char_ratio),  #17
    ]

    X = df.DataFrame()
    for column, row_func in feature_funcs:
        # No raw=True: the row functions index by column label
        # (row['qurey'], row['title']), which needs a Series. With raw=True
        # current pandas passes a bare ndarray and the lookup fails.
        X[column] = data.progress_apply(row_func, axis=1)
    return X

time: 859 ms


In [3]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest fitting dtype.

    Only columns whose dtype is int16/32/64 or float16/32/64 are touched.
    Integer columns go to the first of int8/16/32/64 whose range strictly
    contains the column's min and max. Float columns go to float16, then
    float32, and otherwise float64. When ``verbose`` is true the new memory
    usage and the percentage saved are printed. The same (mutated) frame is
    returned.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    bytes_per_mb = 1024**2

    start_mem = df.memory_usage().sum() / bytes_per_mb
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Narrowest integer type wins; if none fits, the column is left alone.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Neither float16 nor float32 can hold the range.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

time: 3.83 ms


In [4]:
import os
# Directory holding the competition input files (absolute path on the hosting platform).
Data_path='/home/kesci/input/bytedance/'
train_path=os.path.join(Data_path,"train_final.csv")
# Read a 30,000,000-row slice of the headerless training CSV, skipping the first 900,000,000 lines.
# Leftover alternative arguments from earlier edits (presumably other slices): skiprows=980000000,nrows =20000000 ; skiprows=90000000
df_train=df.read_csv(train_path,header=None,skiprows=900000000,nrows =30000000)
# Downcast numeric columns to cut memory use.
df_train = reduce_mem_usage(df_train)
# The string literal below is the disabled test-set loading code; as the last expression it is echoed as the cell output.
'''
test_path=os.path.join(Data_path,"test_final_part1.csv")
df_test=df.read_csv(test_path,header=None)
df_test = reduce_mem_usage(df_test)
'''

Mem. usage decreased to 629.43 Mb (45.0% reduction)


'\ntest_path=os.path.join(Data_path,"test_final_part1.csv")\ndf_test=df.read_csv(test_path,header=None)\ndf_test = reduce_mem_usage(df_test)\n'

time: 12min 46s


In [5]:
# Name the columns of the headerless training frame. The 'qurey' spelling is the key
# the feature functions read (row['qurey']), so it must not be corrected here alone.
df_train.columns=["query_id","qurey","query_title_id","title","label"]
# Test-set equivalent (disabled); it lists no "label" column.
#df_test.columns=["query_id","qurey","query_title_id","title"]

time: 1 ms


In [6]:
# Disabled test-set feature pipeline, kept as a bare string literal so none of it executes;
# it applies the same tokenise -> count -> weight -> build_features steps to df_test.
'''
df_test['qurey'] = df_test['qurey'].str.split()
df_test['title'] = df_test['title'].str.split()
test_qs = df.Series(df_test['qurey'].tolist() + df_test['title'].tolist())
words = [x for y in test_qs for x in y]
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}
print('Building Features')
stops=[]
X_test = build_features(df_test, stops, weights)
'''

"\ndf_test['qurey'] = df_test['qurey'].str.split()\ndf_test['title'] = df_test['title'].str.split()\ntest_qs = df.Series(df_test['qurey'].tolist() + df_test['title'].tolist())\nwords = [x for y in test_qs for x in y]\ncounts = Counter(words)\nweights = {word: get_weight(count) for word, count in counts.items()}\nprint('Building Features')\nstops=[]\nX_test = build_features(df_test, stops, weights)\n"

time: 2.04 ms


In [7]:
# Tokenise both text columns on whitespace; afterwards each cell holds the list returned by str.split().
# NOTE(review): the columns are overwritten in place, so re-running this cell would call
# .str.split() on the already-split lists — reload df_train before re-running.
df_train['qurey'] = df_train['qurey'].str.split()
df_train['title'] = df_train['title'].str.split()
# Pool every query and title word list, flatten them, and count how often each word occurs.
train_qs = df.Series(df_train['qurey'].tolist() + df_train['title'].tolist())
words = [x for y in train_qs for x in y]
counts = Counter(words)
# Per-word weight from get_weight with its default eps and min_count.
weights = {word: get_weight(count) for word, count in counts.items()}
print('Building Features')
# Empty stop list: the stop-word-aware features filter nothing.
stops=[]
X_train = build_features(df_train, stops, weights)

  0%|          | 0/30000000 [00:00<?, ?it/s]

Building Features


100%|██████████| 30000000/30000000 [20:50<00:00, 23989.10it/s]
100%|██████████| 30000000/30000000 [41:34<00:00, 12028.10it/s]
100%|██████████| 30000000/30000000 [32:05<00:00, 15580.09it/s]
100%|██████████| 30000000/30000000 [17:21<00:00, 28805.61it/s]
100%|██████████| 30000000/30000000 [17:20<00:00, 28820.62it/s]
100%|██████████| 30000000/30000000 [18:10<00:00, 27508.59it/s]
100%|██████████| 30000000/30000000 [18:27<00:00, 27079.78it/s]
100%|██████████| 30000000/30000000 [29:57<00:00, 16693.05it/s]
100%|██████████| 30000000/30000000 [18:06<00:00, 27615.08it/s]
100%|██████████| 30000000/30000000 [18:29<00:00, 27027.69it/s]
100%|██████████| 30000000/30000000 [17:37<00:00, 28369.91it/s]

time: 4h 13min 29s





In [8]:
# Downcast the numeric feature columns to smaller dtypes to shrink the frame.
X_train=reduce_mem_usage(X_train)
# Test-set equivalent (disabled).
#X_test=reduce_mem_usage(X_test)

Mem. usage decreased to 600.81 Mb (76.1% reduction)
time: 8.19 s


In [9]:
# Disabled test-set export, kept for reference:
#   data = open('test_feature.pkl', 'wb')
#   import pickle
#   pickle.dump(X_test, file=data)
import pickle

# Write the training features to disk. The context manager closes (and so
# flushes) the file even if pickling fails; the bare open() left it open.
with open('train_feature_first.pkl', 'wb') as data:
    pickle.dump(X_train, file=data)

time: 1.29 s
