In [1]:
# Load modules for data accessing and plotting
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter

In [2]:
# Load NLP modules
import re
import sys
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams

In [3]:
# Load the tab-separated training set.
df_train = pd.read_csv('../data/train.tsv', sep='\t')
# NOTE: this cell used to call `df_train.set_index('train_id')` without keeping
# the returned frame, so it never changed anything; the no-op call is removed
# and `train_id` stays a regular column (renamed to "ID" below).
# Shorten two column names; `index=str` applies str() to every row label.
df_train = df_train.rename(index=str, columns={"train_id": "ID", "item_description": "desc"})
df_train.head()

Unnamed: 0,ID,name,item_condition_id,category_name,brand_name,price,shipping,desc
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [4]:
#####################
## Preprocess data ##
#####################
# Active pattern: a token is a run of ASCII letters, digits, apostrophes or
# backticks. The commented-out lines are alternatives kept for reference.
#token_pattern = "[a-zA-Z'`]+"
token_pattern = "[a-zA-Z0-9'`]+"
#token_pattern = r"(?u)\b\w\w+\b"
def TextPreProcessing(line,
                      token_pattern=token_pattern):
    """Lower-case ``line`` and split it into tokens.

    Parameters
    ----------
    line : object
        Text to tokenise. Non-string values are converted with ``str()``.
        ``None`` and float NaN (how pandas represents a missing text cell)
        are treated as "no text".
    token_pattern : str
        Regular expression handed to nltk's ``RegexpTokenizer``; defaults to
        the module-level ``token_pattern``.

    Returns
    -------
    list of str
        The tokens in order of appearance; ``[]`` for missing input.
    """
    # Without this guard str(nan) / str(None) would be tokenised into the
    # bogus words "nan" / "none" and counted as real text downstream.
    # `line != line` is only true for NaN.
    if line is None or (isinstance(line, float) and line != line):
        return []
    ## tokenize
    tokenizer = RegexpTokenizer(token_pattern)
    return tokenizer.tokenize(str(line).lower())

def Myngrams(text, nfold):
    """Return the ``nfold``-grams of ``text`` as space-joined strings.

    ``text`` is a sequence of tokens; every window produced by ``ngrams`` is
    turned into one string by applying ``str()`` to its items and joining
    them with a single space. Repeated n-grams are kept (no de-duplication),
    and the original order is preserved.
    """
    return [' '.join(map(str, window)) for window in ngrams(text, nfold)]

#print(Myngrams(TextPreProcessing('This is a token. This is a sentence. We are data scientists!', token_pattern), 2))

In [5]:
# Turn off pandas' chained-assignment check (SettingWithCopyWarning).
# This is a process-wide option: it stays in effect for every later cell and
# also hides genuine "assigning to a copy of a slice" mistakes.
# NOTE(review): presumably set because the feature functions below add
# columns to frames sliced from df_train - confirm it is still needed.
pd.options.mode.chained_assignment = None  # pandas default is 'warn'

def TryDivide(x, y, val=0.0):
    """Divide ``x`` by ``y``, falling back to ``val`` when ``y`` is zero.

    ``x`` is converted with ``float()`` before dividing, so the quotient is
    always a true (non-integer) division. ``val`` is returned unchanged when
    ``y`` compares equal to ``0.0``.
    """
    return float(x) / y if y != 0.0 else val

def GetPositionList(tgt, obs):
    """Return the 1-based positions of the items of ``obs`` found in ``tgt``.

    Each item of ``obs`` is tested with ``item in tgt``; the position counts
    from 1 and refers to the item's place inside ``obs``. When ``obs`` is
    empty, or none of its items occur in ``tgt``, the sentinel list ``[0]``
    is returned so callers always get at least one number.
    """
    if len(obs) == 0:
        return [0]
    hits = [place for place, item in enumerate(obs, start=1) if item in tgt]
    return hits or [0]

def DumpTextBasicNgram(df):
    """Add 1-, 2- and 3-gram list columns for the ``name`` and ``desc`` fields.

    The frame is modified in place. Six columns are added, in this order:
    ``name_1gram``, ``desc_1gram``, ``name_2gram``, ``desc_2gram``,
    ``name_3gram``, ``desc_3gram``. Each cell holds the list returned by
    ``Myngrams`` for the tokens of that row's text.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the text columns ``name`` and ``desc``.

    Returns
    -------
    None
    """
    fields = ("name", "desc")

    # Tokenise every text exactly once and reuse the tokens for each n.
    # Plain column iteration is used instead of a row-wise apply, so an
    # empty frame is handled too.
    tokens = {field: [TextPreProcessing(text) for text in df[field]]
              for field in fields}

    for n in (1, 2, 3):
        print("Generate %d-gram..." % n)
        for field in fields:
            df["%s_%dgram" % (field, n)] = [Myngrams(toks, n) for toks in tokens[field]]

    return
    
def DumpTextCountFeatures(df):
    """Add count, overlap and position features derived from the n-gram columns.

    The frame is modified in place. It must already contain the list columns
    ``<field>_<n>gram`` for ``field`` in (``name``, ``desc``) and ``n`` in
    (1, 2, 3), as produced by ``DumpTextBasicNgram``.

    Columns added, in this order:

    1. Per field and n-gram size: ``Count_*`` (number of n-grams),
       ``CountUnique_*`` (number of distinct n-grams) and ``RatioUnique_*``
       (their quotient). Per field: ``CountDigit_*`` (1-grams for which
       ``str.isdigit`` is true) and ``RatioDigit_*``.
    2. Per n-gram size and ordered field pair (o, t): ``Count_o_n_In_t``
       (n-grams of o that also occur in t) and ``Ratio_o_n_In_t``
       (divided by ``Count_o_n``).
    3. Per n-gram size and ordered field pair: min / ave / med / max / std of
       the positions returned by ``GetPositionList`` (``Pos_*``), followed by
       the same statistics divided by ``Count_o_n`` (``NormedPos_*``).

    All quotients go through ``TryDivide``, so a zero denominator gives 0.0.

    Returns
    -------
    None
    """
    fnames = ["name", "desc"]
    # Named gram_labels (not `ngrams`) so the imported nltk helper of that
    # name is not shadowed inside this function.
    gram_labels = ["1gram", "2gram", "3gram"]
    stat_funcs = [("min", np.min), ("ave", np.mean), ("med", np.median),
                  ("max", np.max), ("std", np.std)]

    def count_digit(tokens):
        # Same expression as before: float 1. per all-digit token.
        return sum([1. for w in tokens if w.isdigit()])

    # Ordered (observed, target) field pairs, e.g. ("name", "desc") and back.
    field_pairs = [(oname, tname) for oname in fnames for tname in fnames
                   if tname != oname]

    ################################
    ## text count and digit count ##
    ################################
    print("Generate basic text count features...")
    counts = {}  # (field, label) -> per-row n-gram counts, reused below
    for fname in fnames:
        for label in gram_labels:
            grams = list(df[fname + "_" + label])
            ## word count
            total = [len(g) for g in grams]
            unique = [len(set(g)) for g in grams]
            counts[(fname, label)] = total
            df["Count_%s_%s" % (fname, label)] = total
            df["CountUnique_%s_%s" % (fname, label)] = unique
            df["RatioUnique_%s_%s" % (fname, label)] = [
                TryDivide(u, t) for u, t in zip(unique, total)]
        ## digit count
        digits = [count_digit(g) for g in df[fname + "_1gram"]]
        df["CountDigit_%s" % fname] = digits
        df["RatioDigit_%s" % fname] = [
            TryDivide(d, t) for d, t in zip(digits, counts[(fname, "1gram")])]

    ##########################
    ## intersect text count ##
    ##########################
    print("Generate intersect word count features...")
    for label in gram_labels:
        for oname, tname in field_pairs:
            shared = []
            for obs, tgt in zip(df[oname + "_" + label], df[tname + "_" + label]):
                # Build the lookup set once per row rather than once per token.
                tgt_set = set(tgt)
                shared.append(sum([1. for w in obs if w in tgt_set]))
            df["Count_%s_%s_In_%s" % (oname, label, tname)] = shared
            df["Ratio_%s_%s_In_%s" % (oname, label, tname)] = [
                TryDivide(s, t) for s, t in zip(shared, counts[(oname, label)])]

    ######################################
    ## intersect word position feat ##
    ######################################
    print("Generate intersect word position features...")
    for label in gram_labels:
        for oname, tname in field_pairs:
            pos = [GetPositionList(tgt, obs=obs)
                   for obs, tgt in zip(df[oname + "_" + label], df[tname + "_" + label])]
            key = "%s_%s_In_%s" % (oname, label, tname)
            ## stats feat on pos
            stats = {}
            for suffix, func in stat_funcs:
                stats[suffix] = [func(p) for p in pos]
                df["Pos_%s_%s" % (key, suffix)] = stats[suffix]
            ## stats feat on normalized_pos (added after all Pos_* columns
            ## so the column order stays the same as before)
            for suffix, _ in stat_funcs:
                df["NormedPos_%s_%s" % (key, suffix)] = [
                    TryDivide(v, t) for v, t in zip(stats[suffix], counts[(oname, label)])]

    return

#sub_df_train = df_train.iloc[:50, :]
#DumpTextBasicNgram(sub_df_train)
#DumpTextCountFeatures(sub_df_train)
#sub_df_train.head(20)

In [8]:
def CalcDist(A, B, tdist="Jaccard_Coef"):
    """Set-overlap coefficient between two collections of hashable items.

    Both inputs are converted to sets first, so duplicates are ignored.

    Parameters
    ----------
    A, B : iterable
        Collections to compare (e.g. lists of n-gram strings).
    tdist : str
        Which coefficient to compute:
        ``"Jaccard_Coef"`` - ``|A & B| / |A | B|``;
        ``"Dice_Coef"`` - ``2 * |A & B| / (|A| + |B|)``;
        ``"Overlap_Coef"`` - ``|A & B| / min(|A|, |B|)``.

    Returns
    -------
    float
        The coefficient; whatever ``TryDivide`` returns for a zero
        denominator (0.0 by default) when the denominator is 0.

    Raises
    ------
    ValueError
        If ``tdist`` is not one of the three supported names. Previously an
        unknown name fell through to ``return d`` and failed with an
        UnboundLocalError.
    """
    if tdist not in ("Jaccard_Coef", "Dice_Coef", "Overlap_Coef"):
        raise ValueError(
            "Unknown tdist %r; expected 'Jaccard_Coef', 'Dice_Coef' or 'Overlap_Coef'" % (tdist,))

    A, B = set(A), set(B)
    shared = len(A & B)

    if tdist == "Jaccard_Coef":
        return TryDivide(shared, len(A | B))
    if tdist == "Dice_Coef":
        return TryDivide(2 * shared, len(A) + len(B))
    # Overlap_Coef
    return TryDivide(shared, min(len(A), len(B)))


def DumpTextDistanceFeatures(df):
    """Add set-overlap coefficients between the n-grams of each field pair.

    The frame is modified in place. For every coefficient (Jaccard, Dice,
    Overlap), every n-gram size (1, 2, 3) and every pair of fields taken in
    list order, a column named ``<coef>_<n>gram_Between_<first>_<second>`` is
    added, holding ``CalcDist`` of the two n-gram lists of each row. The
    n-gram list columns (``<field>_<n>gram``) must already exist.

    Returns
    -------
    None
    """
    ## jaccard coef, dice dist and overlap dist of n-gram
    print("Generate basic distance... Jaccard coefficient, Dice coefficient, Overlap coefficient ...")
    fields = ["name", "desc"]
    gram_labels = ["1gram", "2gram", "3gram"]
    coefficients = ["Jaccard_Coef", "Dice_Coef", "Overlap_Coef"]

    # Every pair (first, second) where `first` comes before `second` in `fields`.
    field_pairs = [(first, second)
                   for idx, first in enumerate(fields[:-1])
                   for second in fields[idx + 1:]]

    for coef in coefficients:
        for label in gram_labels:
            for first, second in field_pairs:
                left_col = first + "_" + label
                right_col = second + "_" + label
                scores = df.apply(
                    lambda row: CalcDist(row[left_col], row[right_col], coef),
                    axis=1)
                df["%s_%s_Between_%s_%s" % (coef, label, first, second)] = list(scores)

    return

# Try the feature functions on the first 50 rows only.
# .copy() makes the sample an independent frame: the functions below add
# columns in place, and doing that on a bare .iloc slice of df_train is
# exactly the "assigning to a copy of a slice" pattern pandas warns about.
sub_df_train = df_train.iloc[:50, :].copy()
DumpTextBasicNgram(sub_df_train)
DumpTextDistanceFeatures(sub_df_train)
sub_df_train.head(20)

Generate 1-gram...
Generate 2-gram...
Generate 3-gram...
Generate basic distance... Jaccard coefficient, Dice coefficient, Overlap coefficient ...


Unnamed: 0,ID,name,item_condition_id,category_name,brand_name,price,shipping,desc,name_1gram,desc_1gram,...,desc_3gram,Jaccard_Coef_1gram_Between_name_desc,Jaccard_Coef_2gram_Between_name_desc,Jaccard_Coef_3gram_Between_name_desc,Dice_Coef_1gram_Between_name_desc,Dice_Coef_2gram_Between_name_desc,Dice_Coef_3gram_Between_name_desc,Overlap_Coef_1gram_Between_name_desc,Overlap_Coef_2gram_Between_name_desc,Overlap_Coef_3gram_Between_name_desc
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,"[mlb, cincinnati, reds, t, shirt, size, xl]","[no, description, yet]",...,[no description yet],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,"[razer, blackwidow, chroma, keyboard]","[this, keyboard, is, in, great, condition, and...",...,"[this keyboard is, keyboard is in, is in great...",0.0625,0.0,0.0,0.117647,0.0,0.0,0.5,0.0,0.0
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,"[ava, viv, blouse]","[adorable, top, with, a, hint, of, lace, and, ...",...,"[adorable top with, top with a, with a hint, a...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,"[leather, horse, statues]","[new, with, tags, leather, horses, retail, for...",...,"[new with tags, with tags leather, tags leathe...",0.030303,0.0,0.0,0.058824,0.0,0.0,0.333333,0.0,0.0
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,"[24k, gold, plated, rose]","[complete, with, certificate, of, authenticity]",...,"[complete with certificate, with certificate o...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,Bundled items requested for Ruie,3,Women/Other/Other,,59.0,0,"Banana republic bottoms, Candies skirt with ma...","[bundled, items, requested, for, ruie]","[banana, republic, bottoms, candies, skirt, wi...",...,"[banana republic bottoms, republic bottoms can...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,6,Acacia pacific tides santorini top,3,Women/Swimwear/Two-Piece,Acacia Swimwear,64.0,0,Size small but straps slightly shortened to fi...,"[acacia, pacific, tides, santorini, top]","[size, small, but, straps, slightly, shortened...",...,"[size small but, small but straps, but straps ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,7,Girls cheer and tumbling bundle of 7,3,Sports & Outdoors/Apparel/Girls,Soffe,6.0,1,You get three pairs of Sophie cheer shorts siz...,"[girls, cheer, and, tumbling, bundle, of, 7]","[you, get, three, pairs, of, sophie, cheer, sh...",...,"[you get three, get three pairs, three pairs o...",0.095238,0.0,0.0,0.173913,0.0,0.0,0.571429,0.0,0.0
8,8,Girls Nike Pro shorts,3,Sports & Outdoors/Apparel/Girls,Nike,19.0,0,Girls Size small Plus green. Three shorts total.,"[girls, nike, pro, shorts]","[girls, size, small, plus, green, three, short...",...,"[girls size small, size small plus, small plus...",0.2,0.0,0.0,0.333333,0.0,0.0,0.5,0.0,0.0
9,9,Porcelain clown doll checker pants VTG,3,Vintage & Collectibles/Collectibles/Doll,,8.0,0,I realized his pants are on backwards after th...,"[porcelain, clown, doll, checker, pants, vtg]","[i, realized, his, pants, are, on, backwards, ...",...,"[i realized his, realized his pants, his pants...",0.075472,0.0,0.0,0.140351,0.0,0.0,0.666667,0.0,0.0
