In [None]:
# importing required libraries

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, classification_report

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# importing the dataset

file_path = '/content/drive/MyDrive/Cognifyz Internship/Dataset/Dataset.csv'
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229


Data preprocessing

In [None]:
df = df.drop(['Restaurant ID', 'Country Code', 'Longitude', 'Latitude', 'Address', 'Locality', 'Price range', 'Locality Verbose', 'Currency', 'City', 'Average Cost for two', 'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu', 'Rating color', 'Rating text'], axis=1)
df.head()

Unnamed: 0,Restaurant Name,Cuisines,Aggregate rating,Votes
0,Le Petit Souffle,"French, Japanese, Desserts",4.8,314
1,Izakaya Kikufuji,Japanese,4.5,591
2,Heat - Edsa Shangri-La,"Seafood, Asian, Filipino, Indian",4.4,270
3,Ooma,"Japanese, Sushi",4.9,365
4,Sambo Kojin,"Japanese, Korean",4.8,229


In [None]:
(df.isna().sum()/df.shape[0])*100

Restaurant Name     0.000000
Cuisines            0.094231
Aggregate rating    0.000000
Votes               0.000000
dtype: float64

In [None]:
df['Cuisines']=df['Cuisines'].str.replace(" ","")
df['Cuisines']=df['Cuisines'].str.replace('-','NaN')
df['Cuisines']=df['Cuisines'].fillna('NaN')

In [None]:
(df.isna().sum()/df.shape[0])*100

Restaurant Name     0.0
Cuisines            0.0
Aggregate rating    0.0
Votes               0.0
dtype: float64

In [None]:
df['Cuisines'] = df['Cuisines'].apply(lambda x: x.split(','))

In [None]:
top_cuisines = df['Cuisines'].explode().value_counts().nlargest(25).index

df['Category'] = df['Cuisines'].apply(
    lambda cuisines: next((cuisine for cuisine in cuisines if cuisine in top_cuisines), 'Unclassified')
)
df.head(20)

Unnamed: 0,Restaurant Name,Cuisines,Aggregate rating,Votes,Category
0,Le Petit Souffle,"[French, Japanese, Desserts]",4.8,314,Japanese
1,Izakaya Kikufuji,[Japanese],4.5,591,Japanese
2,Heat - Edsa Shangri-La,"[Seafood, Asian, Filipino, Indian]",4.4,270,Seafood
3,Ooma,"[Japanese, Sushi]",4.9,365,Japanese
4,Sambo Kojin,"[Japanese, Korean]",4.8,229,Japanese
5,Din Tai Fung,[Chinese],4.4,336,Chinese
6,Buffet 101,"[Asian, European]",4.0,520,Asian
7,Vikings,"[Seafood, Filipino, Asian, European]",4.2,677,Seafood
8,Spiral - Sofitel Philippine Plaza Manila,"[European, Asian, Indian]",4.9,621,European
9,Locavore,[Filipino],4.8,532,Unclassified


In [None]:
mlb = MultiLabelBinarizer()
cuisine_matrix = pd.DataFrame(mlb.fit_transform(df['Cuisines']), columns=mlb.classes_)
df = pd.concat([df, cuisine_matrix], axis=1)

In [None]:
df.head()

Unnamed: 0,Restaurant Name,Cuisines,Aggregate rating,Votes,Category,Afghani,African,American,Andhra,Arabian,...,Teriyaki,TexNaNMex,Thai,Tibetan,Turkish,TurkishPizza,Vegetarian,Vietnamese,Western,WorldCuisine
0,Le Petit Souffle,"[French, Japanese, Desserts]",4.8,314,Japanese,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Izakaya Kikufuji,[Japanese],4.5,591,Japanese,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Heat - Edsa Shangri-La,"[Seafood, Asian, Filipino, Indian]",4.4,270,Seafood,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Ooma,"[Japanese, Sushi]",4.9,365,Japanese,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Sambo Kojin,"[Japanese, Korean]",4.8,229,Japanese,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
top_cuisine_columns = [cuisine for cuisine in top_cuisines if cuisine in cuisine_matrix.columns]
X = cuisine_matrix[top_cuisine_columns]
y = df['Category']

Training, Testing and Evaluating the model performance using various classification algorithms

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Logistic Regression

model_LR = LogisticRegression()
model_LR.fit(X_train, y_train)
y_pred_LR = model_LR.predict(X_test)
f1_LR = f1_score(y_test, y_pred_LR, average='weighted')
print(f"F1 Score of Logistic Regression: {f1_LR}")
print("Accuracy:", accuracy_score(y_test, y_pred_LR))
print("Classification Report:\n", classification_report(y_test, y_pred_LR))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


F1 Score of Logistic Regression: 0.8268658553876115
Accuracy: 0.8356881214024071
Classification Report:
               precision    recall  f1-score   support

    American       0.80      0.94      0.86        47
       Asian       0.86      0.92      0.89        13
      Bakery       0.90      0.96      0.93       112
   Beverages       1.00      0.71      0.83        21
     Biryani       0.69      0.90      0.78        20
      Burger       0.75      0.41      0.53        22
        Cafe       0.91      0.99      0.95       128
     Chinese       0.81      0.67      0.73       173
 Continental       0.73      0.41      0.53        46
    Desserts       0.87      0.57      0.69        35
    European       0.92      0.79      0.85        14
    FastFood       0.82      0.70      0.76       152
 HealthyFood       0.77      0.91      0.83        11
    IceCream       0.75      0.97      0.85        31
     Italian       0.75      0.66      0.70        61
    Japanese       1.00      0

In [None]:
# One VS Rest Classification using Random Forest

classifier_RF = OneVsRestClassifier(RandomForestClassifier())
classifier_RF.fit(X_train, y_train)
y_pred_RF = classifier_RF.predict(X_test)
f1_RF = f1_score(y_test, y_pred_RF, average='weighted')
print(f"F1 Score of Random Forest Classifier: {f1_RF}")
print("Accuracy:", accuracy_score(y_test, y_pred_RF))
print("Classification Report:\n", classification_report(y_test, y_pred_RF))

F1 Score of Random Forest Classifier: 0.8226543939538543
Accuracy: 0.8288854003139717
Classification Report:
               precision    recall  f1-score   support

    American       0.84      0.87      0.85        47
       Asian       0.79      0.85      0.81        13
      Bakery       0.91      0.95      0.93       112
   Beverages       1.00      0.71      0.83        21
     Biryani       0.69      0.90      0.78        20
      Burger       0.56      0.45      0.50        22
        Cafe       0.95      0.98      0.97       128
     Chinese       0.82      0.68      0.74       173
 Continental       0.71      0.54      0.62        46
    Desserts       0.85      0.63      0.72        35
    European       0.67      0.43      0.52        14
    FastFood       0.81      0.72      0.76       152
 HealthyFood       0.75      0.82      0.78        11
    IceCream       0.79      1.00      0.89        31
     Italian       0.72      0.69      0.71        61
    Japanese       0.93  

In [None]:
# K Nearest Neighbor Classification

knn_classifier = KNeighborsClassifier(n_neighbors=7)  # You can experiment with different values of 'n_neighbors'
knn_classifier.fit(X_train, y_train)
knn_predictions = knn_classifier.predict(X_test)
knn_f1_score = f1_score(y_test, knn_predictions, average='micro')  # Use 'weighted' for multi-class
print(f"KNN F1 Score: {knn_f1_score}")
print("Accuracy:", accuracy_score(y_test, y_pred_RF))
print("Classification Report:\n", classification_report(y_test, y_pred_RF))

KNN F1 Score: 0.8320251177394035
Accuracy: 0.8288854003139717
Classification Report:
               precision    recall  f1-score   support

    American       0.84      0.87      0.85        47
       Asian       0.79      0.85      0.81        13
      Bakery       0.91      0.95      0.93       112
   Beverages       1.00      0.71      0.83        21
     Biryani       0.69      0.90      0.78        20
      Burger       0.56      0.45      0.50        22
        Cafe       0.95      0.98      0.97       128
     Chinese       0.82      0.68      0.74       173
 Continental       0.71      0.54      0.62        46
    Desserts       0.85      0.63      0.72        35
    European       0.67      0.43      0.52        14
    FastFood       0.81      0.72      0.76       152
 HealthyFood       0.75      0.82      0.78        11
    IceCream       0.79      1.00      0.89        31
     Italian       0.72      0.69      0.71        61
    Japanese       0.93      0.74      0.82      

In [None]:
# Decision Tree Classification

dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)
dt_predictions = dt_classifier.predict(X_test)
dt_f1_score = f1_score(y_test, dt_predictions, average='weighted')
print(f"Decision Tree F1 Score: {dt_f1_score}")
print("Accuracy:", accuracy_score(y_test, y_pred_RF))
print("Classification Report:\n", classification_report(y_test, y_pred_RF))

Decision Tree F1 Score: 0.8193812695349032
Accuracy: 0.8288854003139717
Classification Report:
               precision    recall  f1-score   support

    American       0.84      0.87      0.85        47
       Asian       0.79      0.85      0.81        13
      Bakery       0.91      0.95      0.93       112
   Beverages       1.00      0.71      0.83        21
     Biryani       0.69      0.90      0.78        20
      Burger       0.56      0.45      0.50        22
        Cafe       0.95      0.98      0.97       128
     Chinese       0.82      0.68      0.74       173
 Continental       0.71      0.54      0.62        46
    Desserts       0.85      0.63      0.72        35
    European       0.67      0.43      0.52        14
    FastFood       0.81      0.72      0.76       152
 HealthyFood       0.75      0.82      0.78        11
    IceCream       0.79      1.00      0.89        31
     Italian       0.72      0.69      0.71        61
    Japanese       0.93      0.74      

In [None]:
# Random Forest Classification

model_rfc = RandomForestClassifier(n_estimators=15, random_state=42)
model_rfc.fit(X_train, y_train)
y_pred_rfc = model_rfc.predict(X_test)
rfc_f1_score = f1_score(y_test, y_pred_rfc, average='weighted')
print(f"Random Forest Classifier F1 Score: {rfc_f1_score}")
print("Accuracy:", accuracy_score(y_test, y_pred_rfc))
print("Classification Report:\n", classification_report(y_test, y_pred_rfc))

Random Forest Classifier F1 Score: 0.8217009857132934
Accuracy: 0.8283621140763998
Classification Report:
               precision    recall  f1-score   support

    American       0.83      0.91      0.87        47
       Asian       0.79      0.85      0.81        13
      Bakery       0.91      0.95      0.93       112
   Beverages       1.00      0.71      0.83        21
     Biryani       0.69      0.90      0.78        20
      Burger       0.69      0.41      0.51        22
        Cafe       0.95      0.98      0.97       128
     Chinese       0.82      0.68      0.74       173
 Continental       0.71      0.52      0.60        46
    Desserts       0.85      0.63      0.72        35
    European       0.75      0.43      0.55        14
    FastFood       0.84      0.69      0.76       152
 HealthyFood       0.73      1.00      0.85        11
    IceCream       0.79      0.97      0.87        31
     Italian       0.68      0.75      0.71        61
    Japanese       0.93     

In [None]:
# Support Vector Classification

model_SVC = SVC(kernel='rbf', C=1, random_state=42)
model_SVC.fit(X_train, y_train)
y_pred_SVC = model_SVC.predict(X_test)
svc_f1_score = f1_score(y_test, y_pred_SVC, average='weighted')
print(f"Support Vector Classifier F1 Score: {svc_f1_score}")
print("Accuracy:", accuracy_score(y_test, y_pred_SVC))
print("Classification Report:\n", classification_report(y_test, y_pred_SVC))

Support Vector Classifier F1 Score: 0.8309456338885987
Accuracy: 0.8383045525902669
Classification Report:
               precision    recall  f1-score   support

    American       0.80      0.94      0.86        47
       Asian       0.85      0.85      0.85        13
      Bakery       0.89      0.97      0.93       112
   Beverages       1.00      0.71      0.83        21
     Biryani       0.69      0.90      0.78        20
      Burger       0.82      0.41      0.55        22
        Cafe       0.93      1.00      0.97       128
     Chinese       0.82      0.67      0.74       173
 Continental       0.81      0.54      0.65        46
    Desserts       0.86      0.69      0.76        35
    European       0.71      0.71      0.71        14
    FastFood       0.85      0.69      0.76       152
 HealthyFood       0.71      0.91      0.80        11
    IceCream       0.79      0.97      0.87        31
     Italian       0.81      0.75      0.78        61
    Japanese       1.00    

In [None]:
# Naive Bayes Classification

model_NB = MultinomialNB()
model_NB.fit(X_train, y_train)
y_pred_NB = model_NB.predict(X_test)
nb_f1_score = f1_score(y_test, y_pred_NB, average='weighted')  # Use 'weighted' for multi-class
print(f"Support Vector Classifier F1 Score: {nb_f1_score}")
print("Accuracy:", accuracy_score(y_test, y_pred_NB))
print("Classification Report:\n", classification_report(y_test, y_pred_NB))

Support Vector Classifier F1 Score: 0.6664172289219912
Accuracy: 0.706436420722135
Classification Report:
               precision    recall  f1-score   support

    American       0.72      0.83      0.77        47
       Asian       0.67      0.77      0.71        13
      Bakery       0.78      0.96      0.86       112
   Beverages       1.00      0.71      0.83        21
     Biryani       0.62      0.90      0.73        20
      Burger       0.70      0.32      0.44        22
        Cafe       0.92      0.95      0.93       128
     Chinese       0.67      0.29      0.41       173
 Continental       0.44      0.43      0.44        46
    Desserts       0.80      0.23      0.36        35
    European       0.83      0.71      0.77        14
    FastFood       0.85      0.70      0.77       152
 HealthyFood       0.43      0.27      0.33        11
    IceCream       0.75      0.97      0.85        31
     Italian       0.66      0.57      0.61        61
    Japanese       0.88     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Extreme Gradient Boosting Classification

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
from xgboost import XGBClassifier
model_XGB = XGBClassifier(random_state=42)
model_XGB.fit(X_train, y_train_encoded)
y_pred_encoded = model_XGB.predict(X_test)
y_pred_XGB = label_encoder.inverse_transform(y_pred_encoded)
xgb_f1_score = f1_score(y_test, y_pred_XGB, average='weighted')  # Use 'weighted' for multi-class
print(f"Support Vector Classifier F1 Score: {xgb_f1_score}")
print("Accuracy:", accuracy_score(y_test, y_pred_XGB))
print("Classification Report:\n", classification_report(y_test, y_pred_XGB))

Support Vector Classifier F1 Score: 0.8196244981972901
Accuracy: 0.82574568288854
Classification Report:
               precision    recall  f1-score   support

    American       0.83      0.83      0.83        47
       Asian       0.61      0.85      0.71        13
      Bakery       0.90      0.96      0.93       112
   Beverages       1.00      0.71      0.83        21
     Biryani       0.69      0.90      0.78        20
      Burger       0.59      0.45      0.51        22
        Cafe       0.95      0.96      0.96       128
     Chinese       0.82      0.66      0.73       173
 Continental       0.74      0.57      0.64        46
    Desserts       0.88      0.63      0.73        35
    European       0.46      0.43      0.44        14
    FastFood       0.81      0.71      0.76       152
 HealthyFood       0.71      0.91      0.80        11
    IceCream       0.79      1.00      0.89        31
     Italian       0.75      0.69      0.72        61
    Japanese       0.88      

Showing the model output

In [None]:
pd.DataFrame({"Model Names": ['OneVSRest Classifier', 'Logistic Regression', 'Random Forest Classifier', 'KNN Classifier', 'DT Classifier',
'Support Vector Classifier', 'XGB Classifier', 'Naive Bayes Classifier'], "Accuracy score": [f1_RF*100, f1_LR*100, rfc_f1_score*100, knn_f1_score*100, dt_f1_score*100, svc_f1_score*100, xgb_f1_score*100, nb_f1_score*100]}).sort_values(by="Accuracy score", ascending = False)

Unnamed: 0,Model Names,Accuracy score
3,KNN Classifier,83.202512
5,Support Vector Classifier,83.094563
1,Logistic Regression,82.686586
0,OneVSRest Classifier,82.265439
2,Random Forest Classifier,82.170099
6,XGB Classifier,81.96245
4,DT Classifier,81.938127
7,Naive Bayes Classifier,66.641723


In [None]:
sample=pd.DataFrame({"Actual Cuisine":y_test, "Predicted Cuisine":knn_predictions})
sample

Unnamed: 0,Actual Cuisine,Predicted Cuisine
4731,NorthIndian,NorthIndian
1468,Mughlai,Mughlai
9037,NorthIndian,NorthIndian
7866,FastFood,FastFood
5570,NorthIndian,NorthIndian
...,...,...
8149,NorthIndian,NorthIndian
5849,NorthIndian,NorthIndian
9019,Chinese,Chinese
742,Continental,Continental


In [None]:
sample.to_csv("Prediction of Cuisine Classification Model.csv",index=False)

In [None]:
from google.colab import files
files.download("Prediction of Cuisine Classification Model.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
results_df = pd.DataFrame({'Restaurant': X_test.index, 'Predicted Cuisine': knn_predictions})
merged_df = pd.merge(results_df, df, left_on='Restaurant', right_index=True)
merged_df

Unnamed: 0,Restaurant,Predicted Cuisine,Restaurant Name,Cuisines,Aggregate rating,Votes,Category,Afghani,African,American,...,Teriyaki,TexNaNMex,Thai,Tibetan,Turkish,TurkishPizza,Vegetarian,Vietnamese,Western,WorldCuisine
0,4731,NorthIndian,Wah Ji Wah,[NorthIndian],2.1,54,NorthIndian,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1468,Mughlai,19 Flavours Biryani,"[Mughlai, Hyderabadi]",4.1,84,Mughlai,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9037,NorthIndian,Andaaz E Paranthas,"[NorthIndian, Mughlai]",3.2,36,NorthIndian,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,7866,FastFood,Tony's,[FastFood],4.4,163,FastFood,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5570,NorthIndian,Yummy Adda,"[NorthIndian, Mughlai]",3.5,14,NorthIndian,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1906,8149,NorthIndian,Saleem's Takeaway,"[NorthIndian, Mughlai, Chinese]",2.6,44,NorthIndian,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1907,5849,NorthIndian,Paras Chicken Point,[NorthIndian],0.0,1,NorthIndian,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1908,9019,Chinese,Hungry's Hut,"[Chinese, FastFood]",3.1,10,Chinese,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1909,742,Continental,Communiti,[Continental],4.2,334,Continental,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
merged_df.to_csv("Cuisines And Restaurants.csv",index=False)

In [None]:
from google.colab import files
files.download("Cuisines And Restaurants.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>