In [44]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')


In [34]:
# Load dataset
df = pd.DataFrame({'Restaurant_name': ["ABC", "DEF", "GHI", "JKL"],
                   'Rating': [4.3, 4.7, 3.4, 5.0],
                   'Category': ['Breakfast', 'Salads', 'Soups', 'Second Meal'],
                   'Dish_name': ['Scrambled eggs', 'Salad cucumbers with garlic', 
                                 'Chicken noodle soup', 'Pepper Roast (Trout Fish Fillet)'],
                   'Description': ['Scrambled eggs served with greens...', 
                                   'Cucumber, garlic', 'Light chicken soup...', 
                                   'Juicy pieces of trout...'],
                   'Price': [3.50, 5.50, 3, 12.50]
                   })
df.head()

Unnamed: 0,Restaurant_name,Rating,Category,Dish_name,Description,Price
0,ABC,4.3,Breakfast,Scrambled eggs,Scrambled eggs served with greens...,3.5
1,DEF,4.7,Salads,Salad cucumbers with garlic,"Cucumber, garlic",5.5
2,GHI,3.4,Soups,Chicken noodle soup,Light chicken soup...,3.0
3,JKL,5.0,Second Meal,Pepper Roast (Trout Fish Fillet),Juicy pieces of trout...,12.5


In [35]:
df = df.drop(["Restaurant_name", "Rating"], axis=1)
print(df)

      Category                         Dish_name  \
0    Breakfast                    Scrambled eggs   
1       Salads       Salad cucumbers with garlic   
2        Soups               Chicken noodle soup   
3  Second Meal  Pepper Roast (Trout Fish Fillet)   

                            Description  Price  
0  Scrambled eggs served with greens...    3.5  
1                      Cucumber, garlic    5.5  
2                 Light chicken soup...    3.0  
3              Juicy pieces of trout...   12.5  


In [37]:
# define one hot encoding
encoder = OneHotEncoder(sparse=False)

# transform data
onehot_columns = encoder.fit_transform(df[['Category']])
print("Encoded Matrix:\n", onehot_columns)

Encoded Matrix:
 [[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]]


In [38]:
tf1 = TfidfVectorizer(ngram_range=(1, 1), lowercase=True, stop_words='english')
df_dish_name = tf1.fit_transform(df['Dish_name'])
df_dish_name = pd.DataFrame(data=df_dish_name.toarray(), columns=tf1.get_feature_names_out())

tf2 = TfidfVectorizer(ngram_range=(1, 1), lowercase=True, stop_words='english')
df_description = tf2.fit_transform(df['Description'])
df_dscription = pd.DataFrame(data=df_description.toarray(), columns=tf2.get_feature_names_out())

In [39]:
print("TF-IDF Features for Dish Names:")
print(df_dish_name)

print("\nTF-IDF Features for Descriptions:")
print(df_dscription) 

TF-IDF Features for Dish Names:
   chicken  cucumbers      eggs    fillet      fish   garlic   noodle  \
0  0.00000    0.00000  0.707107  0.000000  0.000000  0.00000  0.00000   
1  0.00000    0.57735  0.000000  0.000000  0.000000  0.57735  0.00000   
2  0.57735    0.00000  0.000000  0.000000  0.000000  0.00000  0.57735   
3  0.00000    0.00000  0.000000  0.447214  0.447214  0.00000  0.00000   

     pepper     roast    salad  scrambled     soup     trout  
0  0.000000  0.000000  0.00000   0.707107  0.00000  0.000000  
1  0.000000  0.000000  0.57735   0.000000  0.00000  0.000000  
2  0.000000  0.000000  0.00000   0.000000  0.57735  0.000000  
3  0.447214  0.447214  0.00000   0.000000  0.00000  0.447214  

TF-IDF Features for Descriptions:
   chicken  cucumber  eggs    garlic  greens    juicy    light   pieces  \
0  0.00000  0.000000   0.5  0.000000     0.5  0.00000  0.00000  0.00000   
1  0.00000  0.707107   0.0  0.707107     0.0  0.00000  0.00000  0.00000   
2  0.57735  0.000000   0.0 

In [41]:
print("One-hot encoded shape:", onehot_columns.shape)
print("TF-IDF Dish Name shape:", df_dish_name.shape)
print("TF-IDF Description shape:", df_description.shape)

One-hot encoded shape: (4, 4)
TF-IDF Dish Name shape: (4, 13)
TF-IDF Description shape: (4, 12)


In [46]:
print(type(onehot_columns))
print(type(df_dish_name))
print(type(df_description))

<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>
<class 'scipy.sparse._csr.csr_matrix'>


In [47]:
df_dish_name = df_dish_name.to_numpy()
df_description = df_description.toarray()
combined_features = np.hstack([onehot_columns, df_dish_name, df_description])

combined_df = pd.DataFrame(combined_features, columns=(
    list(encoder.categories_[0]) +                
    list(tf1.get_feature_names_out()) +           
    list(tf2.get_feature_names_out())             
))

print("Combined features dataFrame:")
print(combined_df.head())


Combined Features DataFrame:
   Breakfast  Salads  Second Meal  Soups  chicken  cucumbers      eggs  \
0        1.0     0.0          0.0    0.0  0.00000    0.00000  0.707107   
1        0.0     1.0          0.0    0.0  0.00000    0.57735  0.000000   
2        0.0     0.0          0.0    1.0  0.57735    0.00000  0.000000   
3        0.0     0.0          1.0    0.0  0.00000    0.00000  0.000000   

     fillet      fish   garlic  ...  eggs    garlic  greens    juicy    light  \
0  0.000000  0.000000  0.00000  ...   0.5  0.000000     0.5  0.00000  0.00000   
1  0.000000  0.000000  0.57735  ...   0.0  0.707107     0.0  0.00000  0.00000   
2  0.000000  0.000000  0.00000  ...   0.0  0.000000     0.0  0.00000  0.57735   
3  0.447214  0.447214  0.00000  ...   0.0  0.000000     0.0  0.57735  0.00000   

    pieces  scrambled  served     soup    trout  
0  0.00000        0.5     0.5  0.00000  0.00000  
1  0.00000        0.0     0.0  0.00000  0.00000  
2  0.00000        0.0     0.0  0.57735  0.00

In [None]:
import lightgbm as lgb
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_cv, label=y_cv)

param = {'objective': 'regression',
         'boosting': 'gbdt',  
         'metric': 'l2_root',
         'learning_rate': 0.05, 
         'num_iterations': 350,
         'num_leaves': 31,
         'max_depth': -1,
         'min_data_in_leaf': 15,
         'bagging_fraction': 0.85,
         'bagging_freq': 1,
         'feature_fraction': 0.55
         }

lgbm = lgb.train(params=param,
                 verbose_eval=50,
                 train_set=train_data,
                 valid_sets=[test_data])
