# Snorkel - Weakly Supervised NER on Restaurant Dataset

**Importing necessary modules**

In [1]:
import pandas as pd
import numpy as np
import re
import editdistance as ed
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from spacy.lang.en import English
import nltk
nltk.download('wordnet')
from nltk import word_tokenize
nltk.download('punkt')
from nltk.corpus import wordnet 
import spacy
from nltk.stem import PorterStemmer, WordNetLemmatizer, LancasterStemmer
from snorkel.labeling import PandasLFApplier, LFApplier, LFAnalysis, labeling_function
from snorkel.analysis import get_label_buckets
from sklearn.metrics import confusion_matrix
from snorkel.labeling import LabelModel
from snorkel.labeling import MajorityLabelVoter

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\I517193\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\I517193\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Schema
* The data consist of the queries users searched for, by voice, converted to text
* All queries are labeled with their tags (each unigram is labeled with the entity name)
* The entities covered are as follows:
  * Restuarant_name
  * Rating
  * Amenity
  * Location
  * Price
  * Hours
  * Dish
  * Cuisine

**Preprocessing the file**

* Separating each candidate noun phrase from the query

In [2]:
def ret_df_file(file):
    with open(file, "r") as f:
        lines = f.readlines()
        size = len(lines) 
        idx_list = [idx + 1 for idx, val in
                enumerate(lines) if val == '\n'] 
        res = [lines[i: j-1] for i, j in
            zip([0] + idx_list, idx_list + 
            ([size] if idx_list[-1] != size else []))] 
        queries = ['' for i in range(len(res))]
        tags = []
        unigrams = []
        words = []
        poss = []
        for i, ele in enumerate(res):
            flag = False
            tempWords = []
            temT = []
            temW = []
            temPos = []
            for j, actual_str in enumerate(ele):
                res[i][j] = actual_str.split('\t')
                res[i][j][-1] = res[i][j][-1].replace('\n', '')
                if res[i][j][0][:2]=="B-" or res[i][j][0][:2]=="I-":
                    if flag:
                        if tempTag == res[i][j][0][2:]:
                            tempWords.append(res[i][j][-1])
                        else:
                            temT.append(tempTag)
                            temW.append(' '.join(tempWords))
                            pos[-1] = j-1
                            temPos.append(pos)
                            tempWords = []
                            pos = [j,j]
                            tempWords.append(res[i][j][-1])
                            tempTag = res[i][j][0][2:]
                        # tempTag.append
                    else:
                        tempWords = []
                        tempWords.append(res[i][j][-1])
                        pos = [j,j]
                        tempTag = res[i][j][0][2:]
                        flag = True
                else:
                    if flag:
                        temT.append(tempTag)
                        temW.append(' '.join(tempWords))
                        pos[-1] = j-1
                        temPos.append(pos)
                        flag = False
                if j!=0:
                    queries[i] = queries[i] + ' ' + res[i][j][-1]
                else:
                    queries[i] = res[i][j][-1]
            if flag:
                temT.append(tempTag)
                temW.append(' '.join(tempWords))
                pos[-1] = j
                temPos.append(pos)
                flag = False
            tags.append(temT)
            words.append(temW)
            poss.append(temPos)
    df_queries = []
    df_pos_start = []
    df_pos_end = []
    n_grams = []
    df_tags = []
    for i, ele in enumerate(res):
        for j in range(len(tags[i])):
            df_queries.append(queries[i])
            df_tags.append(tags[i][j])
            df_pos_start.append(poss[i][j][0])
            df_pos_end.append(poss[i][j][1])
            n_grams.append(words[i][j])
    return pd.DataFrame({"n_grams": n_grams, "Tag": df_tags, "Query": df_queries, "Pos_start": df_pos_start, "Pos_end":df_pos_end})

**Getting the train and test data**

In [3]:
df_train = ret_df_file("../Datasets/MITRestuarant/restauranttrain.bio")
df_test = ret_df_file("../Datasets/MITRestuarant/restauranttest.bio")

In [4]:
print(df_train.shape)
print(df_test.shape)

(15363, 5)
(3151, 5)


In [5]:
df_train.head(5)

Unnamed: 0,n_grams,Tag,Query,Pos_start,Pos_end
0,2 start,Rating,2 start restaurants with inside dining,0,1
1,inside dining,Amenity,2 start restaurants with inside dining,4,5
2,5 star,Rating,5 star resturants in my town,0,1
3,in my town,Location,5 star resturants in my town,3,5
4,hong kong,Restaurant_Name,98 hong kong restaurant reasonable prices,1,2


In [6]:
df_test.head(5)

Unnamed: 0,n_grams,Tag,Query,Pos_start,Pos_end
0,four star,Rating,a four star restaurant with a bar,1,2
1,with a,Location,a four star restaurant with a bar,4,5
2,bar,Amenity,a four star restaurant with a bar,6,6
3,asian,Cuisine,any asian cuisine around,1,1
4,around,Location,any asian cuisine around,3,3


**Now I extract candidate noun phrases from the queries using nlp module's noun chunks**

Also, to label the noun phrases, I am using the following assumptions:
* The labeled noun phrase and the noun phrase extracted using nlp should have an intersection of more than one word
* The absolute value of the difference in the number of words in the labelled and the extracted noun phrase should not be more than 2 words

If both the above criteria are satisfied, then I label the noun phrase given by the nlp module using the label from the noun phrase already labelled.

*Note: I need to do this as using the same noun phrase manually labelled would not be available at test time, and we should train the model in a similar way*

In [7]:
def obtain_noun_chunks_limited(df_train):
    list_of_queries = df_train.Query.unique()
    queries_train = []
    tags_train = []
    phrases_train = []
    pos_S = []
    pos_E = []
    nlp = spacy.load("en_core_web_sm")
    for query in list_of_queries:
        doc = nlp(query)
        positions_start = list(df_train.Pos_start[df_train.Query==query])
        positions_end = list(df_train.Pos_end[df_train.Query==query])
        for noun in doc.noun_chunks:
            flag = False
            for i in range(len(positions_start)):
                # checking intersection>0 and extra words in noun_chunk not more than 2 and difference 
                # between the phrases of not more than 2 words
                if len(set(list(range(noun.start, noun.end))) & \
                       set(list(range(positions_start[i], positions_end[i]+1))))>0 \
                and (len(set(list(range(noun.start, noun.end))).difference(set(list(range(positions_start[i], \
                                                                                            positions_end[i]+1)))))<=2 \
                    and len(set(list(range(positions_start[i], positions_end[i]+1))).difference(set(list(range(noun.start, noun.end)))))<=2):
                    queries_train.append(query)
                    tags_train.append(list(df_train.Tag[np.logical_and(df_train.Query==query, \
                                                                  df_train.Pos_start==positions_start[i])])[0])
                    phrases_train.append(noun.text)
                    pos_S.append(noun.start)
                    pos_E.append(noun.end-1)
                    flag = True
                    break
    return pd.DataFrame({"n_grams": phrases_train, "Tag": tags_train, \
                                "Query": queries_train, "Pos_start": pos_S, "Pos_end":pos_E})

In [139]:
df_train_limited_chunks = obtain_noun_chunks_limited(df_train)

In [140]:
df_valid_limited_chunks = obtain_noun_chunks_limited(df_test)

In [141]:
df_valid_limited_chunks.head()

Unnamed: 0,n_grams,Tag,Query,Pos_start,Pos_end
0,a four star restaurant,Rating,a four star restaurant with a bar,0,3
1,a bar,Location,a four star restaurant with a bar,5,6
2,any asian cuisine,Cuisine,any asian cuisine around,0,2
3,any bbq places,Cuisine,any bbq places open before 5 nearby,0,2
4,any dancing establishments,Location,any dancing establishments with reasonable pri...,0,2


**There are the following types of entities available in the data**

In [142]:
df_train_limited_chunks.Tag.unique()

array(['Rating', 'Amenity', 'Location', 'Hours', 'Dish', 'Cuisine',
       'Price', 'Restaurant_Name'], dtype=object)

**Defining the Labels**

Now we assign numeric labels to the entities and then change them in the dataset

In [143]:
# RATING = 0
AMENITY = 0
LOCATION = 1
# RESTUARANT_NAME = 3
HOURS = 2
# DISH = 5
CUISINE = 3
PRICE = 4
ABSTAIN  = -1

In [144]:
df_train_limited_chunks.Tag[df_train_limited_chunks.Tag=='Rating'] = RATING
df_train_limited_chunks.Tag[df_train_limited_chunks.Tag=='Amenity'] = AMENITY
df_train_limited_chunks.Tag[df_train_limited_chunks.Tag=='Location'] = LOCATION
df_train_limited_chunks.Tag[df_train_limited_chunks.Tag=='Restaurant_Name'] = RESTUARANT_NAME
df_train_limited_chunks.Tag[df_train_limited_chunks.Tag=='Hours'] = HOURS
df_train_limited_chunks.Tag[df_train_limited_chunks.Tag=='Dish'] = DISH
df_train_limited_chunks.Tag[df_train_limited_chunks.Tag=='Cuisine'] = CUISINE
df_train_limited_chunks.Tag[df_train_limited_chunks.Tag=='Price'] = PRICE

## Writing Labelling functions

**Labelling functions for Rating**

In [145]:
@labeling_function()
def lf_rating_star(x):
    # Returns a label of rating if pattern of digit star's found in the phrase
    n_grams = x.n_grams.lower()
    if re.findall("[\d]+[\W]?star", n_grams):
        return RATING
    else:
        return ABSTAIN

In [146]:
@labeling_function()
def lf_rating_adj(x):
    # Returns a label of rating if any adjective is within one edit distance or the base form of adjective is found in the phrase
    adjectives = ['fresh', 'recommended', 'fair', 'recommendation', 'rated'
       'terrific', 'well', 'awesome', 'awards', 'authentic', 'pleasing', 'favorite',
        'lovely', 'really', 'decent', 'busy', 'superior','average'
       'famous', 'poor', 'decent', 'simple', 'real', 'popular', 'wonderful',
       'casual', 'perfect', 'massive', 'nearby', 'nice',
       'delicious', 'winning', 'favourite',  'negative', 'award', 'positive', 'fancy',
       'outstanding', 'good', 'amazing', 'recommend',
       'strong', 'value','incredible',
       'fantastic', 'classy', 'top', 'former', 'rated','reviews','horrible','terrible', 
       'local', 'excellent', 'place',
       'tasty', 'rate', 'high', 'great']
    lemmatiser = WordNetLemmatizer()
    for word in x.n_grams.lower().split():
        for adjective in adjectives:
            # ed.eval(adjective, word)<=1 or \
            if lemmatiser.lemmatize(word, pos = "a")==adjective:
                return RATING
    return ABSTAIN

In [147]:
@labeling_function()
def lf_rating_adj_syn(x):
    # Returns a label of rating if any word in the phrase is a synonym or an antonym within one edit distance of the mentioned words
    adjectives = ['fresh', 'recommended', 'fair', 'recommendation', 'rated'
       'terrific', 'well', 'awesome', 'awards', 'authentic', 'pleasing', 'favorite',
        'lovely', 'really', 'decent', 'busy', 'superior',
       'famous', 'poor', 'decent', 'simple', 'real', 'popular', 'wonderful',
       'casual', 'perfect', 'massive', 'nearby', 'nice',
       'delicious', 'winning', 'favourite',  'negative', 'award', 'positive', 'fancy',
       'outstanding', 'good', 'amazing', 'recommend',
       'strong', 'value','incredible',
       'fantastic', 'classy', 'top', 'former', 'rated','reviews','horrible','terrible', 
       'local', 'excellent', 'place',
       'tasty', 'rate', 'high', 'great']
    synonyms = [] 
    antonyms = [] 
    for adj in adjectives:
        for syn in wordnet.synsets(adj): 
            for l in syn.lemmas(): 
                synonyms.append(l.name()) 
#                 if l.antonyms(): 
#                     antonyms.append(l.antonyms()[0].name()) 
    
    lemmatiser = WordNetLemmatizer()
    for word in x.n_grams.lower().split():
        for adjective in synonyms:
            #ed.eval(adjective, word)<=1 or \
            if lemmatiser.lemmatize(word, pos = "a")==adjective:
                return RATING
#         for adjective in antonyms:
#             #ed.eval(adjective, word)<=1 or \
#             if lemmatiser.lemmatize(word, pos = "a")==adjective:
#                 return RATING
    return ABSTAIN

**Labelling functions for Amenity**

In [148]:
@labeling_function()
def lf_amenity_lem(x):
    # Returns a label of amenity if any word is within one edit distance of the mentioned adjectives or it's lemmatized form
    amenities = ['formal', 'outdoor', 'friendly', 'parking', 'special'
       'quiet', 'smoking', 'atmosphere', 'anniversary', 'birthday', 'tourist', 'reservation']
    lemmatiser = WordNetLemmatizer()
    for word in x.n_grams.lower().split():
        for adjective in amenities:
            # ed.eval(adjective, word)<=1 or 
            if lemmatiser.lemmatize(word, pos = "a")==adjective:
                return AMENITY
    return ABSTAIN

In [149]:
@labeling_function()
def lf_amenity_syn(x):
    # Returns a label of amenity if any word in the phrase is a synonym or an antonym of the mentioned words within one edit distance
    amenities = ['formal', 'outdoor', 'friendly', 'parking', 'special'
       'quiet', 'smoking', 'atmosphere', 'anniversary', 'birthday', 'tourist', 'reservation']

    synonyms = [] 
    antonyms = [] 
    for adj in amenities:
        for syn in wordnet.synsets(adj): 
            for l in syn.lemmas(): 
                synonyms.append(l.name()) 
                if l.antonyms(): 
                    antonyms.append(l.antonyms()[0].name()) 
    
    lemmatiser = WordNetLemmatizer()
    for word in x.n_grams.lower().split():
        for adjective in synonyms:
            # ed.eval(adjective, word)<=1 or 
            if lemmatiser.lemmatize(word, pos = "a")==adjective:
                return AMENITY
#         for adjective in antonyms:
#             #ed.eval(adjective, word)<=1 or 
#             if lemmatiser.lemmatize(word, pos = "a")==adjective:
#                 return AMENITY
    return ABSTAIN

**Labeling function for Location**

In [150]:
@labeling_function()
def lf_location_prep_before(x):
    # Returns a label of location if the word before the phrase is a preposition mentioned
    if (x.Pos_start>=1 and x.Query.split()[x.Pos_start - 1].lower() in ['in', 'near', 'above', 'over', 'by', 'along', 'around']) or \
    (x.Pos_start>1 and x.Query.split()[x.Pos_start - 2].lower() in  ['in', 'near', 'above', 'over', 'by', 'along', 'on']):
        return LOCATION
    else:
        return ABSTAIN

In [151]:
@labeling_function()
def lf_location_prep_after(x):
    # Returns a label of location if the word after the phrase is the word "where"
    if x.Pos_end + 1 <= (len(x.Query.strip().split()) - 1) \
    and x.Query.split()[x.Pos_end + 1].lower() in ['where']:
        return LOCATION
    else:
        return ABSTAIN

**Labeling function for Hours**

In [152]:
@labeling_function()
def lf_hours_keywords(x):
    # Returns a label of hours if the the phrase contains any of the words mentioned below
    keywords = ['hour', 'hours', 'am', 'a.m.', 'pm', 'p.m.', 'early', 'late']
    for word in x.n_grams.lower().split():
        if word in keywords:
            return HOURS
    return ABSTAIN

**Labeling function for Price**

In [153]:
@labeling_function()
def lf_price_keywords(x):
    # Returns a label of price if the the phrase contains any of the words mentioned below
    keywords = ['cheap', 'expensive', 'reasonable', 'bucks', 'dollars', 'price', 'prices', 'affordable', \
                'midpriced', 'high', 'least']
    for word in x.n_grams.lower().split():
        if word in keywords:
            return PRICE
    return ABSTAIN

**Labeling function for cuisine**

In [154]:
@labeling_function()
def lf_cuisine_keywords(x):
    # Returns a label of cuisine if the the phrase contains any of the words mentioned below
    cuisine_words_list = ['Ainu', 'Albanian', 'Argentine', 'Andhra', 'Anglo-Indian', 'Arab', 'Armenian', 'Assyrian', 'Awadhi', \
                'Azerbaijani', 'Balochi', 'Belarusian', 'Bangladeshi', 'Bengali', 'Berber', 'Brazilian', 'Buddhist', \
                'Bulgarian', 'Cajun', 'Cantonese', 'Caribbean', 'Chechen', 'Chinese', 'Chinese Islamic', 'Circassian', \
                'Crimean Tatar', 'Cypriot', 'Danish', 'English', 'Estonian', 'French', 'Filipino', 'Georgian', 'German', \
                'Goan', 'Goan Catholic', 'Greek', 'Gujarati', 'Hyderabad', 'Hong Kong Western', 'Indian', 'Indian Chinese', \
                'Indian Singaporean', 'Indonesian', 'Inuit', 'Irish', 'Italian American', 'Italian', 'Jamaican', 'Japanese', \
                'Jewish', 'Karnataka', 'Kazakh', 'Keralite', 'Korean', 'Kurdish', 'Laotian', 'Lebanese', 'Latvian', \
                'Lithuanian', 'Louisiana Creole', 'Maharashtrian', 'Mangalorean', 'Malay', 'Malaysian Chinese', \
                'Malaysian Indian', 'Mediterranean', 'Mexican', 'Mordovian', 'Mughal', 'Native American', 'Nepalese', \
                'New Mexican', 'Odia', 'Parsi', 'Pashtun', 'Polish', 'Pennsylvania Dutch', 'Pakistani', 'Peranakan', \
                'Persian', 'Peruvian', 'Portuguese', 'Punjabi', 'Rajasthani', 'Romanian', 'Russian', 'Sami', 'Serbian', \
                'Sindhi', 'Slovak', 'Slovenian', 'Somali', 'South Indian', 'Soviet', 'Spanish', 'Sri Lankan', 'Taiwanese', \
                'Tatar', 'Thai', 'Turkish', 'Tamil', 'Udupi', 'Ukrainian', 'Vietnamese', 'Yamal', 'Zambian', 'Zanzibari', \
               'cuisine', 'cuisines']
    new_cuisine_list = []
    for word in cuisine_words_list:
        new_cuisine_list.append(word.lower())
    for word in x.n_grams.lower().split():
        if word in new_cuisine_list:
            return CUISINE
    return ABSTAIN

In [155]:
df_rating_amenity_train = df_train_limited_chunks[df_train_limited_chunks['Tag'].isin([0,1, 2, 3, 4, -1])]

In [156]:
df_train_lf_rating_amenity, df_test_lf_rating_amenity, y_lf_train, y_lf_test = train_test_split(df_rating_amenity_train.drop('Tag', axis = 1),\
                                                                                df_rating_amenity_train['Tag'], test_size = 0.3)

In [157]:
from snorkel.labeling import PandasLFApplier

lfs = [lf_amenity_lem, lf_amenity_syn, \
       lf_location_prep_before, lf_location_prep_after, lf_hours_keywords, lf_price_keywords, lf_cuisine_keywords]

In [158]:
df_test_lf_rating_amenity.head()

Unnamed: 0,n_grams,Query,Pos_start,Pos_end
7315,taco bueno phone number,what is taco bueno phone number,2,5
4587,a chinese restaurant,id like to find a chinese restaurant nearby th...,5,7
8242,the service,where can i find a good sauce the service does...,7,8
7604,this area,what place has the best reviews for dining in ...,9,10
4067,breakfast restaurants,i want a list of breakfast restaurants that se...,5,6


In [159]:
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train_lf_rating_amenity)
L_dev = applier.apply(df=df_test_lf_rating_amenity)

100%|█████████████████████████████████████████████████████████████████████████████| 6076/6076 [00:11<00:00, 532.19it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2604/2604 [00:04<00:00, 537.11it/s]


**Looking at the coverage of different labeling functions**

In [160]:
coverage_amenity_lem, coverage_amenity_syn, coverage_loc_before, \
coverage_loc_after, coverage_hours, coverage_price, coverage_cuisine = (L_train != ABSTAIN).mean(axis=0)
print(f"Labelling function stars coverage: {coverage_stars * 100:.1f}%")
# print(f"Labelling function adjectives coverage: {coverage_adjectives * 100:.1f}%")
# print(f"Labelling function synonyms of adjectives coverage: {coverage_synonyms * 100:.1f}%")
print(f"Labelling function lemmas of amenity coverage: {coverage_amenity_lem * 100:.1f}%")
print(f"Labelling function synonyms of amenity coverage: {coverage_amenity_syn * 100:.1f}%")
print(f"Labelling function synonyms of location_before_prep coverage: {coverage_loc_before * 100:.1f}%")
print(f"Labelling function synonyms of location_after_prep coverage: {coverage_loc_after * 100:.1f}%")
print(f"Labelling function synonyms of hours coverage: {coverage_hours * 100:.1f}%")
print(f"Labelling function synonyms of price coverage: {coverage_price * 100:.1f}%")
print(f"Labelling function synonyms of cuisine coverage: {coverage_cuisine * 100:.1f}%")

Labelling function stars coverage: 0.7%
Labelling function lemmas of amenity coverage: 4.8%
Labelling function synonyms of amenity coverage: 5.2%
Labelling function synonyms of location_before_prep coverage: 12.9%
Labelling function synonyms of location_after_prep coverage: 0.6%
Labelling function synonyms of hours coverage: 2.2%
Labelling function synonyms of price coverage: 3.6%
Labelling function synonyms of cuisine coverage: 9.1%


In [161]:
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_amenity_lem,0,[0],0.048387,0.048387,0.000658
lf_amenity_syn,1,[0],0.052172,0.050362,0.002633
lf_location_prep_before,2,[1],0.129197,0.003456,0.002962
lf_location_prep_after,3,[1],0.006419,0.00181,0.001317
lf_hours_keywords,4,[2],0.021725,0.000494,0.000494
lf_price_keywords,5,[4],0.036208,0.000823,0.000823
lf_cuisine_keywords,6,[3],0.091343,0.001646,0.001646


In [163]:
y_test_int = y_lf_test.astype('int32')

**Looking at the performance of different labelling functions**

In [164]:
LFAnalysis(L=L_dev, lfs=lfs)\
.lf_summary(y_test_int)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
lf_amenity_lem,0,[0],0.041475,0.041475,0.001536,107,1,0.990741
lf_amenity_syn,1,[0],0.047235,0.043011,0.003072,115,8,0.934959
lf_location_prep_before,2,[1],0.132488,0.00576,0.004224,315,30,0.913043
lf_location_prep_after,3,[1],0.006528,0.003456,0.00192,5,12,0.294118
lf_hours_keywords,4,[2],0.020737,0.000768,0.000768,46,8,0.851852
lf_price_keywords,5,[4],0.031874,0.002304,0.002304,65,18,0.783133
lf_cuisine_keywords,6,[3],0.096774,0.00384,0.00384,236,16,0.936508


**Checking the total coverage of all the labelling functions**

In [165]:
# total coverage
((L_dev != -1).sum(1) > 0).sum() / len(L_dev)

0.326036866359447

**Checking the accuracy of the Majority Label voter model**

In [169]:
majority_model = MajorityLabelVoter(cardinality=5)
preds_train = majority_model.predict(L=L_train)

majority_acc = majority_model.score(L=L_dev, Y=y_test_int)["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

Majority Vote Accuracy:   42.7%


**Checking the accuracy of the Label Model**

In [170]:
label_model = LabelModel(cardinality=5, verbose=True)
label_model.fit(L_train=L_train, n_epochs=3000, lr=1e-5)#, class_balance=[0.3, 0.3, 0.2, 0.2])

label_model_acc = label_model.score(L=L_dev, Y=y_test_int)["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

Label Model Accuracy:     42.7%


**Looking at the confusion matrix**

In [171]:
confusion_matrix(label_model.predict(L=L_dev), y_test_int)

array([[223,  76,  20, 140,   0],
       [137, 386,  18, 177,   7],
       [119,  75,  61, 133,   3],
       [112,  88,  12, 376,   7],
       [121,  72,  14, 160,  67]], dtype=int64)

In [172]:
train_labels = label_model.predict_proba(L_train).argmax(axis=1)
df_train_lf_rating_amenity['label'] = train_labels
df_train_lf_rating_amenity.head()

Unnamed: 0,n_grams,Query,Pos_start,Pos_end,label
958,french cuisine,can you find a restaurant near downtown that s...,9,10,3
1726,credit cards,does aiea manapua and snacks accept credit cards,6,7,0
2567,food,find me a good sports bar that serves food,8,8,0
1795,glen cove ny,does china palace in glen cove ny have a lunch...,4,6,1
628,any tourist type sightseeing restaurants,are there any tourist type sightseeing restaur...,2,6,0
