In [1]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

In [2]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Data Loading

In [3]:
# load data file into Python
train = pd.read_json("./data/train.json")
test = pd.read_json("./data/test.json") 

### Data Processing

In [4]:
# clean  
train['ingredients_clean_string'] = [' , '.join(z).strip() for z in train['ingredients']]  
test['ingredients_clean_string'] = [' , '.join(z).strip() for z in test['ingredients']]

# further clean data and extract information through word lemmatization
train['ingredients_string'] = [' '.join([WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', line)) 
                                         for line in lists]).strip() for lists in train['ingredients']]       
test['ingredients_string'] = [' '.join([WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', line)) 
                                          for line in lists]).strip() for lists in test['ingredients']]       


In [5]:
# create corpus based on newly processed data
train_corpus = train['ingredients_string']
test_corpus = test['ingredients_string']

# convert a collection of raw documents to a matrix of TF-IDF features
train_vectorizer = TfidfVectorizer(stop_words='english',
                             ngram_range = ( 1 , 1 ),analyzer="word", 
                             max_df = .57 , binary=False , token_pattern=r'\w+' , sublinear_tf=False)
test_vectorizer = TfidfVectorizer(stop_words='english')

# transform the corpus to a dense matrix representation
train_tfidf=train_vectorizer.fit_transform(train_corpus).todense()
test_tfidf=train_vectorizer.transform(test_corpus)


### Model Training

In [6]:
# perpare data for model training
train_predictor = train_tfidf
test_predictor = test_tfidf

train_target = train['cuisine']


classifier = LinearSVC(C=0.80, penalty="l2", dual=False)
model = LogisticRegression()

# grid search
parameters = {'C':[1, 10]}
classifier = GridSearchCV(model, parameters)


In [7]:
# fit classification model to data
classifier=classifier.fit(train_predictor,train_target)

# make prediction
prediction=classifier.predict(test_predictor)

# assign predicted values to cuisine in TEST set
test['cuisine'] = prediction

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [8]:
best=classifier.best_estimator_
print(best.coef_)

[[-3.96681074e-03 -1.96500697e-03 -3.57643629e-02 ... -1.15864196e-01
  -1.68836910e-01 -2.25961004e+00]
 [-5.04603762e-03 -5.50483015e-03 -5.17630919e-02 ... -1.64300327e-01
  -6.40440856e-02 -4.45287387e-01]
 [-3.75601616e-03 -2.73665985e-03  1.40479687e+00 ... -1.00266429e-01
  -2.09402317e-01 -2.61976572e+00]
 ...
 [-6.71387350e-03 -3.94347360e-02 -3.45328573e-02 ...  8.95791263e-01
  -3.08197680e-01 -6.04873971e-01]
 [-8.44411360e-03 -2.10091684e-03 -2.50133068e-02 ... -4.05085196e-02
  -4.32527271e-02  1.18280044e+00]
 [-3.77392017e-02 -2.40756679e-03 -1.11536642e-02 ... -4.61920112e-02
  -3.41478529e-02 -2.06352475e+00]]


In [9]:
print(best.classes_)

['brazilian' 'british' 'cajun_creole' 'chinese' 'filipino' 'french'
 'greek' 'indian' 'irish' 'italian' 'jamaican' 'japanese' 'korean'
 'mexican' 'moroccan' 'russian' 'southern_us' 'spanish' 'thai'
 'vietnamese']


### Result Analysis and Visualization

In [10]:
# create a dictionary to map cuisine to its respective index
import os 
if not os.path.exists('output/coefficients'):
    os.makedirs('output/coefficients')

#### Coefficient Bar Chart

In [12]:
for i, country in enumerate(best.classes_):
    line = go.Bar(x=train_vectorizer.get_feature_names(),y=best.coef_[i],text=best.coef_[i],textposition="outside",name="Number of users",marker_color="rgb(0,200,225)")
    fig = go.Figure(line)
    fig.update_traces(marker_line_color='rgb(0,0,255)',opacity=1)
    fig.update_layout(title=country,xaxis=dict(title="component"),yaxis=dict(title="value"))
    pio.write_image(fig, f"output/coefficients/{country}.png")
    if i == len(best.classes_)-1:
        fig.show()


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



#### Signature Ingridents Pie Chart

In [16]:
dict_specialty={}
for i, country in enumerate(best.classes_):
    dict_specialty[country]={}
# if the coefficient is greater than 5, add to dictionary marked as characteristic of the cuisine, 
# if less than -5, add to dictionary marked as not characteristic of the cuisine
    for j in range(len(train_vectorizer.get_feature_names())):
        if best.coef_[i][j] > 5 or best.coef_[i][j] < -5:
            dict_specialty[country][train_vectorizer.get_feature_names()[j]]=best.coef_[i][j]


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



In [73]:
if not os.path.exists('output/signatures'):
    os.makedirs('output/signatures')

In [93]:
for country,ingredient in dict_specialty.items():
    # Add the pie chart
    labels, values = zip(*[(k, v) for k, v in ingredient.items() if v >= 5])
    fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
    fig.add_trace(go.Pie(labels=labels, values=values, name="ingredients"))
    # Update the pie chart to create a donut chart
    fig.update_traces(hole=.4, hoverinfo="label+percent+name")

    # Update layout to adjust title and annotations
    fig.update_layout(
        title_text="Signature Ingredients(coef>5) in " + country.title() + " Cuisine",
        width=500)
    

    # if there are ingredients with coefficient less than -5, add to the excluded list
    excluded = [(k,round(v,1)) for k, v in ingredient.items() if v < -5]
    if len(excluded) > 0:
        excluded_text = ", ".join([f"{k}" for k, v in excluded])
        fig.update_layout(annotations=[dict(text="Excluded Ingredients(<-5):" + excluded_text, x=-0.2, y=-0.15, font_size=10, showarrow=False),
        dict(text=country.upper(), x=0.5, y=0.5, font_size=15, showarrow=False)])

    else:
        fig.update_layout(annotations=[dict(text=country.upper(), x=0.5, y=0.5, font_size=15, showarrow=False)])
    
    pio.write_image(fig, f"output/signatures/{country}.png")

    if country == "vietnamese":
        fig.show()
