## Takeaway name classifier implementation


In [238]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [239]:
from fastai import *
from fastai.text import *
import string
from unidecode import unidecode


## Consumer the PHE data and label

Load the file and the model, apply the model and store the output

In [240]:
df_phe = pd.read_csv('30_03_2020_PHE_method.csv')

In [241]:
df = pd.read_csv('07_07_2020_final.csv')

valid_idx = df[df.valid].index
train_idx = df[~df.valid].index

bal_idx = []
for k, v in zip(df.index, df.bal):
    bal_idx += [k]*v



In [242]:
df2_phe = df_phe[["BusinessName"]]

In [243]:
df2_phe

Unnamed: 0,BusinessName
0,abc barbeque
1,abm - happy kitchen
2,adilias station road bakery
3,amt cappuccino bar
4,arbury court fish bar
5,arjuna ltd kitchen 1 & ground floor
6,baguette company
7,barracuda fish bar
8,bon appetit mayfair limited
9,boost juice bars


In [244]:
class LetterTokenizer(BaseTokenizer):
    "Character level tokenizer function."
    def __init__(self, lang): pass
    def tokenizer(self, t:str) -> List[str]:
        out = []
        i = 0
        while i < len(t):
            if t[i:].startswith(BOS):
                out.append(BOS)
                i += len(BOS)
            else:
                out.append(t[i])
                i += 1
        return out
            
    def add_special_cases(self, toks:Collection[str]): pass

In [245]:
itos = [UNK, BOS] + list(string.ascii_lowercase + " -'@&)(." +"0123456789")

In [246]:
vocab=Vocab(itos)
tokenizer=Tokenizer(LetterTokenizer, pre_rules=[], post_rules=[])

In [247]:
train_df = df.iloc[train_idx, [3,2]]
bal_df = df.iloc[bal_idx, [3,2]]
valid_df = df.iloc[valid_idx, [3,2]]

In [248]:
df_small = df2_phe[0:200]

## Classifier with Just Eat data

In [249]:
data = TextClasDataBunch.from_df(path='.', train_df=bal_df, valid_df=valid_df,
                         tokenizer=tokenizer, vocab=vocab,
                         mark_fields=False, bs=128)

In [250]:
data.add_test(df2_phe)

In [251]:
data

TextClasDataBunch;

Train: LabelList (40038 items)
x: TextList
xxbos   s o u t h e r n   f r i e d   c h i c k e n     a n d     p i z z a,xxbos   s o u t h e r n   f r i e d   c h i c k e n     a n d     p i z z a,xxbos   s o u t h e r n   f r i e d   c h i c k e n     a n d     p i z z a,xxbos   s o u t h e r n   f r i e d   c h i c k e n     a n d     p i z z a,xxbos   s o u t h e r n   f r i e d   c h i c k e n     a n d     p i z z a
y: CategoryList
Fast Food,Fast Food,Fast Food,Fast Food,Fast Food
Path: .;

Valid: LabelList (4000 items)
x: TextList
xxbos   o m a r   s h a r i f s,xxbos   c h a r g h a s   f a s t   f o o d,xxbos   r o b e r t o ' s   p i z z a,xxbos   p o p   i n,xxbos   j a s m i n e   i n n
y: CategoryList
South Asian,Fast Food,Pizza,Fish & Chips,Southeast & East Asian
Path: .;

Test: LabelList (65253 items)
x: TextList
xxbos   a b c   b a r b e q u e,xxbos   a b m   -   h a p p y   k i t c h e n,xxbos   a d i l i a s   s t a t i o n   r o a d   b a k e r y,xxb

In [252]:
learn = text_classifier_learner(data, AWD_LSTM, drop_mult=0.4, bptt=70)


In [253]:
learn.load('07_07_2020_final')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (40038 items)
x: TextList
xxbos   s o u t h e r n   f r i e d   c h i c k e n     a n d     p i z z a,xxbos   s o u t h e r n   f r i e d   c h i c k e n     a n d     p i z z a,xxbos   s o u t h e r n   f r i e d   c h i c k e n     a n d     p i z z a,xxbos   s o u t h e r n   f r i e d   c h i c k e n     a n d     p i z z a,xxbos   s o u t h e r n   f r i e d   c h i c k e n     a n d     p i z z a
y: CategoryList
Fast Food,Fast Food,Fast Food,Fast Food,Fast Food
Path: .;

Valid: LabelList (4000 items)
x: TextList
xxbos   o m a r   s h a r i f s,xxbos   c h a r g h a s   f a s t   f o o d,xxbos   r o b e r t o ' s   p i z z a,xxbos   p o p   i n,xxbos   j a s m i n e   i n n
y: CategoryList
South Asian,Fast Food,Pizza,Fish & Chips,Southeast & East Asian
Path: .;

Test: LabelList (65253 items)
x: TextList
xxbos   a b c   b a r b e q u e,xxbos   a b m   -   h a p p y   k i t c h e n,xxbos   a d i l i a s   s t a t i o n   r o a d  

In [254]:
learn.predict(df2_phe.BusinessName[18])

(Category Southeast & East Asian,
 tensor(9),
 tensor([3.0248e-08, 7.4545e-10, 7.3720e-11, 5.8726e-33, 3.5318e-10, 7.4385e-11,
         3.0552e-08, 1.1761e-09, 1.6489e-07, 1.0000e+00]))

In [255]:
#txt_ci = TextClassificationInterpretation.from_learner(learn)
interp2 = TextClassificationInterpretation(learn, *learn.get_preds(ds_type = DatasetType.Test, ordered = True, with_loss = True))

In [256]:
preds = interp2.pred_class

In [257]:
probs = [i.numpy().max() for i in interp2.probs]

In [258]:
probs

[0.90618414,
 0.8066343,
 0.978884,
 0.9709519,
 0.9940362,
 0.36029842,
 0.9999646,
 0.9977162,
 0.5591927,
 0.63854796,
 0.95245385,
 0.67052406,
 0.999739,
 0.999739,
 0.77140063,
 0.79457814,
 0.99999833,
 0.99999976,
 0.99999976,
 0.93750024,
 0.87470937,
 0.9949708,
 0.9999994,
 0.9996081,
 0.6075147,
 0.99999976,
 0.90337515,
 0.8164276,
 0.9999918,
 0.9999918,
 0.9999918,
 0.32498372,
 0.9999969,
 0.9397374,
 0.611787,
 0.9999995,
 0.67263985,
 0.9038936,
 0.9781718,
 0.9865247,
 0.98953676,
 0.999509,
 0.96630424,
 0.41494775,
 0.5112004,
 0.8072738,
 0.9975032,
 0.99801874,
 0.86194766,
 0.4354783,
 0.9981231,
 0.9981231,
 0.98034954,
 0.83195794,
 0.37740615,
 0.9987237,
 0.8655512,
 0.8756727,
 0.9999707,
 0.9915308,
 0.9999796,
 0.9999889,
 0.9998423,
 0.43819976,
 0.8542738,
 0.97256666,
 0.94277346,
 0.9668719,
 0.9668719,
 0.9668719,
 0.85582733,
 0.96804374,
 0.5728723,
 0.99961615,
 0.8498602,
 0.57807,
 0.9971859,
 0.44072592,
 0.9910987,
 0.8793216,
 0.9917195,
 0.5

In [259]:
result = preds.numpy()

In [260]:
result

array([0, 0, 7, 6, ..., 8, 9, 8, 4])

In [261]:
labelled_preds = [learn.data.classes[i] for i in result]

In [262]:
labelled_preds

['Burger',
 'Burger',
 'Sand/Caf/Bake',
 'Pizza',
 'Fish & Chips',
 'South Asian',
 'Sand/Caf/Bake',
 'Fish & Chips',
 'Sand/Caf/Bake',
 'Chicken',
 'Sand/Caf/Bake',
 'Fish & Chips',
 'Sand/Caf/Bake',
 'Sand/Caf/Bake',
 'Chicken',
 'Southeast & East Asian',
 'Chicken',
 'Chicken',
 'Southeast & East Asian',
 'Southeast & East Asian',
 'Sand/Caf/Bake',
 'South Asian',
 'Kebab',
 'Desserts',
 'Fish & Chips',
 'Kebab',
 'Sand/Caf/Bake',
 'Fish & Chips',
 'Pizza',
 'Pizza',
 'Pizza',
 'Fish & Chips',
 'Southeast & East Asian',
 'Fish & Chips',
 'Sand/Caf/Bake',
 'Kebab',
 'Southeast & East Asian',
 'Southeast & East Asian',
 'Southeast & East Asian',
 'Southeast & East Asian',
 'South Asian',
 'Southeast & East Asian',
 'Sand/Caf/Bake',
 'Southeast & East Asian',
 'Southeast & East Asian',
 'Desserts',
 'Fish & Chips',
 'South Asian',
 'Desserts',
 'South Asian',
 'Southeast & East Asian',
 'Southeast & East Asian',
 'Kebab',
 'Southeast & East Asian',
 'Southeast & East Asian',
 'Southeas

In [263]:
df2_phe['output'] = labelled_preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [264]:
df_label = df2_phe

In [265]:
replacers = ["mcdonald", "burger king", "kfc", "papa john", "domino", "subway", "gregg" ]

In [266]:
chain_map = {"mcdonald":"Burger", "kfc":"Chicken"}
#chains2 = {r"\b{}\b".format(k): v for k, v in chains2.items()}

In [267]:
for item in replacers:
    df_label.loc[df_label['BusinessName'].str.contains(item), 'tempName'] = item

df_label["output"]=df_label["tempName"].map(chain_map).fillna(df_label['output'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying 

In [268]:
df_out = df_phe
df_out['output'] = df_label['output']
df_out['probs'] = probs

In [269]:
df_out

Unnamed: 0.1,Unnamed: 0,FHRSID,LocalAuthorityBusinessID,BusinessName,BusinessType,BusinessTypeID,AddressLine1,AddressLine2,AddressLine3,AddressLine4,...,scores.Structural,scores.ConfidenceInManagement,geocode.longitude,geocode.latitude,meta.extractDate,LocalAuthorityId,DownloadDate,LastPublishedDate,output,probs
0,9,506499,PI/000023107,abc barbeque,Takeaway/sandwich shop,7844,15 The Broadway,Mill Road,Cambridge,Cambridgeshire,...,15.0,10.0,0.143618,52.198028,0001-01-01T00:00:00,1,2019-09-25,2019-09-25T00:30:05.643,Burger,0.906184
1,10,1132343,PI/000096668,abm - happy kitchen,Takeaway/sandwich shop,7844,22 Cheddars Lane,Cambridge,Cambridgeshire,,...,,,0.145604,52.212226,0001-01-01T00:00:00,1,2019-09-25,2019-09-25T00:30:05.643,Burger,0.806634
2,20,987293,PI/000084185,adilias station road bakery,Takeaway/sandwich shop,7844,7 Station Road,Cambridge,Cambridgeshire,,...,5.0,5.0,0.131527,52.195190,0001-01-01T00:00:00,1,2019-09-25,2019-09-25T00:30:05.643,Sand/Caf/Bake,0.978884
3,43,506480,PI/000022130,amt cappuccino bar,Takeaway/sandwich shop,7844,Cambridge Railway Station,Station Road,Cambridge,Cambridgeshire,...,15.0,5.0,0.137284,52.194106,0001-01-01T00:00:00,1,2019-09-25,2019-09-25T00:30:05.643,Pizza,0.970952
4,57,505949,PI/000000955,arbury court fish bar,Takeaway/sandwich shop,7844,42 Arbury Court,Cambridge,Cambridgeshire,,...,5.0,5.0,0.128207,52.224635,0001-01-01T00:00:00,1,2019-09-25,2019-09-25T00:30:05.643,Fish & Chips,0.994036
5,65,507025,PI/000075216,arjuna ltd kitchen 1 & ground floor,Takeaway/sandwich shop,7844,12 Mill Road,Cambridge,Cambridgeshire,,...,5.0,0.0,0.133661,52.201248,0001-01-01T00:00:00,1,2019-09-25,2019-09-25T00:30:05.643,South Asian,0.360298
6,90,741406,PI/000090505,baguette company,Takeaway/sandwich shop,7844,Addenbrookes,Hills Road,Cambridge,Cambridgeshire,...,5.0,0.0,,,0001-01-01T00:00:00,1,2019-09-25,2019-09-25T00:30:05.643,Sand/Caf/Bake,0.999965
7,105,1014387,PI/000096703,barracuda fish bar,Takeaway/sandwich shop,7844,39 Ditton Lane,Cambridge,Cambridgeshire,,...,5.0,5.0,0.168617,52.216176,0001-01-01T00:00:00,1,2019-09-25,2019-09-25T00:30:05.643,Fish & Chips,0.997716
8,138,799963,PI/000091698,bon appetit mayfair limited,Takeaway/sandwich shop,7844,Astrazeneca Uk Ltd Bon Appetit,Cambridge Biomedical Campus,Francis Crick Avenue,Cambridge Cambridgeshire,...,10.0,5.0,,,0001-01-01T00:00:00,1,2019-09-25,2019-09-25T00:30:05.643,Sand/Caf/Bake,0.559193
9,141,813277,PI/000091533,boost juice bars,Takeaway/sandwich shop,7844,Juice Bar,Lion Yard,Cambridge,Cambridgeshire,...,5.0,5.0,0.121312,52.204698,0001-01-01T00:00:00,1,2019-09-25,2019-09-25T00:30:05.643,Chicken,0.638548


In [270]:
df_out.to_csv('07_07_2020_phe_output.csv', index=False)

In [271]:
counts = df_out.groupby('output').BusinessName.count().sort_values(ascending=False)
counts

output
Southeast & East Asian    12789
Fish & Chips              10013
Pizza                      9843
Sand/Caf/Bake              8491
South Asian                8014
Burger                     5430
Chicken                    4266
Kebab                      3909
Desserts                   1281
Fast Food                  1217
Name: BusinessName, dtype: int64