In [1]:
import pandas as pd 
import numpy as np 
from pymongo import MongoClient
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
import os
import cPickle as pk
import pdb
import math
from unidecode import unidecode
import re

In [2]:
def findidx(StemWords,word):
    """Return the replacement key for ``word``, or ``word`` itself.

    Walks ``StemWords.items()`` in iteration order and returns the first key
    whose value contains ``word`` (tested with ``word in value``). When no
    value contains the word, the word is returned unchanged.
    """
    matching_keys = (stem for stem, variants in StemWords.items() if word in variants)
    # next() with a default stops at the first hit and falls back to the word.
    return next(matching_keys, word)

In [15]:
def flatten2one(xS):
    """Flatten a Series of token lists into one flat list.

    ``xS.tolist()`` is walked row by row and every item of every row is
    appended to the result. Float items are normalised to ``np.nan``.

    A row that is itself a float (NaN, i.e. a missing value) or ``None`` is
    not iterable; it contributes a single ``np.nan`` instead of raising
    ``TypeError`` as a plain nested iteration would.
    """
    flat = []
    for sub_list in xS.tolist():
        if sub_list is None or isinstance(sub_list, float):
            # Missing row: keep a NaN placeholder rather than iterating it.
            flat.append(np.nan)
            continue
        for item in sub_list:
            flat.append(np.nan if isinstance(item, float) else item)
    return flat

In [4]:
def stemmer(x,stemdic):
    """Tokenize ``x`` and replace each token using the stemming dictionary.

    Parameters
    ----------
    x : str or float
        Text to process. A float (NaN, i.e. a missing value) yields ``np.nan``.
    stemdic : pandas.Series
        Stemming dictionary. For each token, only the entries whose index
        label starts with the token's first character are passed to
        ``findidx``, which returns the index label of the first entry whose
        value contains the token (or the token itself when none does).

    Returns
    -------
    list or float
        The list of (possibly replaced) tokens, or ``np.nan`` for float input.
    """
    if isinstance(x,float):
        return np.nan
    # str.replace() matches its argument literally, so the character class
    # was never applied; re.sub is required to actually strip punctuation.
    tokens = re.sub(r'[^\w\s]', '', x).strip().split()
    for idx,word in enumerate(tokens):
        # Narrow the dictionary to entries sharing the word's first letter.
        candidates = stemdic.loc[stemdic.index.str.startswith(word[0])]
        tokens[idx] = findidx(candidates,word)
    return tokens

In [5]:
def dumpstopwords():
    """Build the combined stop-word set and pickle it to ``stop_words.pk``.

    The set is the union of NLTK's English, French and German stop words plus
    a few extra tokens ('test', '1234', 'various', 'addresses'). The result
    is written to ``stop_words.pk`` in the current working directory; nothing
    is returned.
    """
    eng_stop_words = set(stopwords.words("english")+['test','1234','various','addresses'])
    fr_stop_words = set(stopwords.words("french"))
    ger_stop_words = set(stopwords.words("german"))
    stop_words = eng_stop_words.union(fr_stop_words).union(ger_stop_words)
    # Context manager guarantees the file is flushed and closed.
    with open("stop_words.pk","wb") as stop_file:
        pk.dump(stop_words,stop_file)

In [6]:
def get_relevcols(df, colname):
    """List the columns of ``df`` whose lower-cased label contains ``colname``.

    The match is a substring test against ``col.lower()``; ``colname`` itself
    is used as given (it is not lower-cased). Column order is preserved and
    an empty list is returned when nothing matches.
    """
    matching = []
    for col in df.columns.values:
        if colname in col.lower():
            matching.append(col)
    return matching

In [7]:
def loadfilters():
    """Load the pickled stop words and the stemming dictionary from disk.

    Reads ``stop_words.pk`` and ``stemming_dict.csv`` from the current
    working directory.

    Returns
    -------
    tuple
        ``(stop_words, stemingdic)`` where ``stop_words`` is the unpickled
        object and ``stemingdic`` is a Series holding the CSV's second column,
        indexed by its first column.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load a
    # stop_words.pk that this project produced itself.
    with open("stop_words.pk","rb") as stop_file:
        stop_words = pk.load(stop_file)
    stemingdic = pd.read_csv("stemming_dict.csv",header=None,index_col=False)
    # Column 0 becomes the index; column 1 is kept as the values.
    stemingdic.index = stemingdic[0]
    stemingdic = stemingdic.iloc[:,1]
    return (stop_words,stemingdic)

In [8]:
def gramclean(x,stop_words):
    """Strip punctuation from ``x`` and drop its stop words.

    Null input (as judged by ``pd.isnull``) yields ``np.nan``. Otherwise every
    character that is neither a word character nor whitespace is removed, the
    text is split on whitespace, tokens found in ``stop_words`` are discarded
    and the remaining tokens are re-joined with single spaces.
    """
    if pd.isnull(x):
        return np.nan
    without_punct = re.sub(r"[^\w\s]","",x)
    kept = []
    for token in without_punct.split():
        if token not in stop_words:
            kept.append(token)
    return ' '.join(kept)

In [14]:
def preclean(dat):
    """Trim a trailing character from tokens whose shortened form also occurs.

    Intended for plural forms such as "inns" -> "inn" or "hotels" -> "hotel".
    The vocabulary is the set of all tokens in ``dat`` (via ``flatten2one``).
    Each row's token list is then edited in place: a token longer than two
    characters and not purely digits is replaced by itself minus its last
    character when that shortened form is in the vocabulary.

    NOTE(review): the check drops any final character, not only 's'.

    Parameters
    ----------
    dat : pandas.Series
        Series whose rows are lists of string tokens; float rows (NaN) are
        skipped by the per-row pass.

    Returns
    -------
    pandas.Series
        The same ``dat`` object, with its lists modified in place.
    """
    cln_d = set(flatten2one(dat))

    def snip(x,cln):
        # Check for a float (NaN) row first: len() on a float raises
        # TypeError, so the type test must precede the length test.
        if isinstance(x,float) or len(x)<1:
            return
        for idx,w in enumerate(x):
            if len(w)<=2 or w.isdigit():
                continue
            if w[:-1] in cln:
                x[idx]=w[:-1]

    # apply() is used only for its side effect; snip mutates each list.
    dat.apply(snip,args=(cln_d,))
    return dat

In [10]:
def translittunicode(x):
    """Transliterate ``x`` to ASCII (for example, n-tilde becomes n).

    Float input (such as NaN for a missing value) is returned untouched.
    Anything else is converted with ``unicode()`` and passed through
    ``unidecode``.
    """
    if not isinstance(x,float):
        return unidecode(unicode(x))
    return x

In [27]:
def prepareem(prop_subdf,name):
    """Build, pickle and return the cleaned feature frame for ``prop_subdf``.

    Steps:
      1. Load the stop words and stemming dictionary (``loadfilters``).
      2. Lower-case the name and address columns, strip punctuation and stop
         words (``gramclean``), transliterate to ASCII (``translittunicode``)
         and tokenize/stem them (``stemmer``).
      3. Assemble a frame with name, address, lower-cased and transliterated
         city, latitude, longitude and mapped_to_ext_id.
      4. Trim trailing characters in name/address tokens (``preclean``).
      5. Pickle the frame to ``"emfeatures_" + name + ".pk"``.

    Parameters
    ----------
    prop_subdf : pandas.DataFrame
        Source frame. For each field, the first column whose lower-cased
        label contains the field name is used.
    name : str
        Suffix used in the output pickle's file name.

    Returns
    -------
    pandas.DataFrame
        The feature frame that was written to disk.
    """
    def first_col(key):
        # Several columns may contain `key`; the first match is the one used.
        return prop_subdf[get_relevcols(prop_subdf,key)].iloc[:,0]

    #Load the stop words and stemming dictionary to use
    stop_words,stemingdic = loadfilters()

    #Clean data and tokenize
    namedat = first_col('name').str.lower()
    addrdat = first_col('address').str.lower()
    namedat = namedat.apply(gramclean, args=(stop_words,))
    addrdat = addrdat.apply(gramclean, args=(stop_words,))

    #Transliterate non ASCII characters
    namedat = namedat.apply(translittunicode)
    addrdat = addrdat.apply(translittunicode)

    #Stem certain common words
    namedat = namedat.apply(stemmer, args=(stemingdic,))
    addrdat = addrdat.apply(stemmer, args=(stemingdic,))

    #Create a feature set that will be used for classification
    emfeatures = pd.DataFrame({"name": namedat,
                "address": addrdat,
                "city": first_col('city').str.lower(),
                "latitude": first_col('latitude'),
                "longitude": first_col('longitude'),
                "mapped_to_ext_id": first_col("mapped_to_ext_id")
                              })

    #Transliterate city names as well
    # Item access avoids any clash between column labels and DataFrame attributes.
    emfeatures["city"] = emfeatures["city"].apply(translittunicode)
    # preclean edits the token lists in place, so the frame sees the changes.
    preclean(emfeatures["name"])
    preclean(emfeatures["address"])

    # Context manager guarantees the pickle is flushed and the file closed.
    with open("emfeatures_"+name+".pk","wb") as out_file:
        pk.dump(emfeatures,out_file)

    return emfeatures

Unnamed: 0,HotelId,Latitude,Longitude,Name,_id,address,category,category_code,chain,chain_code,...,country,country_code,email,location,mapped_to,mapped_to_ext_id,verification,verified,website,zipcode
2,547193,-0.186336,51.512418,Shaftesbury Hyde Park International Hotel,58d8b11c1755559c475ab7cd,Inverness Terrace 52-56,4 STARS,4EST,,,...,United Kingdom,UK,,"[51.512418, -0.186336]",239270.0,aji9algwwk,F,False,,W2 3LB
4,450723,51.49693,-0.03369,Odessa Wharf,58d8b11c1755559c475ab7ee,7 Odessa Street Rotherhithe,3 STARS,3EST,,,...,United Kingdom,UK,,"[-0.03369, 51.49693]",211570.0,24dm72pi1y,D,,,
5,443581,51.492724,-0.190573,Presidential Apartments Kensington,58d8b11c1755559c475ab80c,"6-12 Barkston Gardens, South Kensington",APARTMENT 3RD CATEGORY,AT3,,,...,United Kingdom,UK,RESERVATIONS@PRESIDENTIAL-KENSINGTON.COM,"[-0.1905733, 51.4927241]",322022.0,dk3lzcq01c,D,,www.presidentialapartmentslondon.com,SW5 0EN
11,189477,51.524276,-0.184641,The Colonnade,58d8b11c1755559c475ab92e,WARRINGTON CRESCENT 2,4 STARS,4EST,,,...,United Kingdom,UK,reservations@colonnadehotel.co.uk,"[-0.184641, 51.524276]",151406.0,aowl0c2jj3,D,,www.colonnadehotel.co.uk,W9 1ER
16,547192,51.527875,-0.089001,M by Montcalm Shoreditch London Tech City,58d8b11c1755559c475aba0d,151 - 157 City Road,4 STARS,4EST,,,...,United Kingdom,UK,,"[-0.0890010613423, 51.527875222]",480679.0,7hwlrghwlj,,,,EC1V 1JS
23,547200,-0.28315,51.537919,Travelodge London Wembley Hotel,58d8b11c1755559c475aba57,"North Circular Road, Ealing",2 STARS,2EST,,,...,United Kingdom,UK,,"[51.537919, -0.28315]",622235.0,am1cci2vas,,,,NW10 7UG
25,547196,-0.085397,51.502567,Smart City Apartments London Bridge,58d8b11c1755559c475aba98,54 Weston Street,4 STARS,4EST,,,...,United Kingdom,UK,,"[51.502567, -0.085397]",570605.0,t8xj9acf86,,,,SE1 3QJ
