In [1]:
# Load modules for data accessing and plotting
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from collections import Counter

In [2]:
# Load NLP modules
import re
import sys
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams

In [3]:
# Load the training set and shorten two column names.
# Note: an earlier `df_train.set_index('train_id')` statement was removed here;
# its result was never assigned, so it had no effect on df_train.
# `index=str` passes every row label through str(), as in the original call.
df_train = (
    pd.read_csv('../data/train.tsv', sep='\t')
    .rename(index=str, columns={"train_id": "ID", "item_description": "desc"})
)
df_train.head()

Unnamed: 0,ID,name,item_condition_id,category_name,brand_name,price,shipping,desc
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [4]:
#####################
## Preprocess data ##
#####################
# Alternative patterns that were tried:
#token_pattern = "[a-zA-Z'`]+"
#token_pattern = r"(?u)\b\w\w+\b"
# Active pattern: runs of ASCII letters, digits, apostrophes and backticks.
token_pattern = "[a-zA-Z0-9'`]+"
def TextPreProcessing(line,
                      token_pattern=token_pattern):
    """
    Lower-case `line` and split it into tokens.

    Parameters
    ----------
    line : object
        The text to tokenise. Non-string values are converted with str().
        Missing values (None or float NaN) yield an empty token list.
    token_pattern : str
        Regular expression handed to RegexpTokenizer; each match is a token.

    Returns
    -------
    list of str
        The tokens in order of appearance.
    """
    # A missing value would otherwise be stringified and tokenised as the
    # literal word "none"/"nan", giving the row a phantom token.
    # `line != line` is true only for NaN.
    if line is None or (isinstance(line, float) and line != line):
        return []
    ## tokenize
    tokenizer = RegexpTokenizer(token_pattern)
    return tokenizer.tokenize(str(line).lower())

def Myngrams(text, nfold):
    """
    Build the `nfold`-grams of `text` as space-joined strings.

    `text` and `nfold` are passed straight to `ngrams`; every item of each
    resulting n-gram is converted with str() and the items are joined with a
    single space. Repeated n-grams are kept (no de-duplication).
    """
    return [' '.join(map(str, gram)) for gram in ngrams(text, nfold)]

#print(Myngrams(TextPreProcessing('This is a token. This is a sentence. We are data scientists!', token_pattern), 2))

In [10]:
# Turn off pandas' chained-assignment check (the option's default is 'warn').
pd.set_option("mode.chained_assignment", None)

def TryDivide(x, y, val=0.0):
    """
    Divide `x` by `y`, falling back to `val` when `y` is zero.

    Parameters
    ----------
    x : number
        Numerator; converted with float() before dividing.
    y : number
        Denominator.
    val : object
        Value returned unchanged when `y` compares equal to 0.0.

    Returns
    -------
    float(x) / y when y != 0.0, otherwise `val`.
    """
    return float(x) / y if y != 0.0 else val

def DumpTextCountFeatures(df):
    """
    Add n-gram count features for the "name" and "desc" columns to `df`.

    The frame is modified in place; nothing is returned. For each text column
    <col> in ("name", "desc") and each n in (1, 2, 3) these columns are added:

      Count_<col>_<n>gram        number of n-grams in the text
      CountUnique_<col>_<n>gram  number of distinct n-grams
      RatioUnique_<col>_<n>gram  CountUnique / Count (0.0 when Count is 0)

    Per text column:

      CountDigit_<col>           number of 1-grams for which str.isdigit() holds
      RatioDigit_<col>           CountDigit / Count_<col>_1gram

    And for each ordered pair (a, b) of distinct text columns and each n:

      Count_<a>_<n>gram_In_<b>   n-grams of a (with repeats) also present in b
      Ratio_<a>_<n>gram_In_<b>   the above divided by Count_<a>_<n>gram

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "name" and "desc" columns.

    Notes
    -----
    Relies on TextPreProcessing, Myngrams and TryDivide from this notebook.
    The intermediate n-gram lists are kept in local variables rather than
    being written to `df` and dropped afterwards, so `df` never carries
    temporary columns. Ideas not implemented here: a description-missing
    indicator and cross ratios between the intersect counts.
    """
    text_cols = ["name", "desc"]
    # (column-name label, n) pairs; named so as not to shadow nltk's `ngrams`.
    gram_sizes = [("1gram", 1), ("2gram", 2), ("3gram", 3)]

    def count_digits(grams):
        # sum() of an empty list is the int 0, exactly as in the original lambda.
        return sum([1. for w in grams if w.isdigit()])

    # Tokenise every text once; all n-gram sizes are derived from these tokens.
    tokens = {col: [TextPreProcessing(text) for text in df[col]] for col in text_cols}

    # grams[(col, label)] is a list with one n-gram list per row of df.
    grams = {}
    for label, size in gram_sizes:
        print("Generate %d-gram..." % size)
        for col in text_cols:
            grams[(col, label)] = [Myngrams(row_tokens, size) for row_tokens in tokens[col]]

    ################################
    ## text count and digit count ##
    ################################
    print("Generate basic text count features...")
    for col in text_cols:
        for label, _ in gram_sizes:
            rows = grams[(col, label)]
            ## word count
            totals = [len(row) for row in rows]
            uniques = [len(set(row)) for row in rows]
            df["Count_%s_%s" % (col, label)] = totals
            df["CountUnique_%s_%s" % (col, label)] = uniques
            df["RatioUnique_%s_%s" % (col, label)] = [
                TryDivide(u, t) for u, t in zip(uniques, totals)
            ]
        ## digit count
        digits = [count_digits(row) for row in grams[(col, "1gram")]]
        unigram_totals = [len(row) for row in grams[(col, "1gram")]]
        df["CountDigit_%s" % col] = digits
        df["RatioDigit_%s" % col] = [
            TryDivide(d, t) for d, t in zip(digits, unigram_totals)
        ]

    ##########################
    ## intersect text count ##
    ##########################
    print("Generate intersect word count features...")
    for label, _ in gram_sizes:
        for oname in text_cols:
            for tname in text_cols:
                if tname == oname:
                    continue
                hits = []
                for own, other in zip(grams[(oname, label)], grams[(tname, label)]):
                    # Build the lookup set once per row instead of once per word.
                    other_set = set(other)
                    hits.append(sum([1. for w in own if w in other_set]))
                own_totals = [len(own) for own in grams[(oname, label)]]
                df["Count_%s_%s_In_%s" % (oname, label, tname)] = hits
                df["Ratio_%s_%s_In_%s" % (oname, label, tname)] = [
                    TryDivide(h, t) for h, t in zip(hits, own_totals)
                ]

    return

# Try the feature generation on the first 50 rows only.
# DumpTextCountFeatures adds columns in place, so hand it an independent copy
# rather than a slice of df_train.
sub_df_train = df_train.iloc[:50, :].copy()
DumpTextCountFeatures(sub_df_train)
sub_df_train.head(20)

#DumpTextCountFeatures(df_train)
#df_train.head()

Generate 1-gram...
Generate 2-gram...
Generate 3-gram...
Generate basic text count features...
Generate intersect word count features...


Unnamed: 0,ID,name,item_condition_id,category_name,brand_name,price,shipping,desc,Count_name_1gram,CountUnique_name_1gram,...,Count_desc_1gram_In_name,Ratio_desc_1gram_In_name,Count_name_2gram_In_desc,Ratio_name_2gram_In_desc,Count_desc_2gram_In_name,Ratio_desc_2gram_In_name,Count_name_3gram_In_desc,Ratio_name_3gram_In_desc,Count_desc_3gram_In_name,Ratio_desc_3gram_In_name
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,7,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,4,4,...,2.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,3,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,3,3,...,1.0,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,4,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,Bundled items requested for Ruie,3,Women/Other/Other,,59.0,0,"Banana republic bottoms, Candies skirt with ma...",5,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,6,Acacia pacific tides santorini top,3,Women/Swimwear/Two-Piece,Acacia Swimwear,64.0,0,Size small but straps slightly shortened to fi...,5,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,7,Girls cheer and tumbling bundle of 7,3,Sports & Outdoors/Apparel/Girls,Soffe,6.0,1,You get three pairs of Sophie cheer shorts siz...,7,7,...,9.0,0.173077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,8,Girls Nike Pro shorts,3,Sports & Outdoors/Apparel/Girls,Nike,19.0,0,Girls Size small Plus green. Three shorts total.,4,4,...,2.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,9,Porcelain clown doll checker pants VTG,3,Vintage & Collectibles/Collectibles/Doll,,8.0,0,I realized his pants are on backwards after th...,6,6,...,4.0,0.072727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
