In [26]:
!pip install pandas numpy matplotlib seaborn scikit-learn catboost plotly kneed category_encoders



In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from category_encoders import CatBoostEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor


In [28]:
df=pd.read_csv("ramen-ratings.csv")

In [29]:
df.set_index('Review #', inplace=True)

In [30]:
df

Unnamed: 0_level_0,Brand,Variety,Style,Country,Stars,Top Ten
Review #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2580,New Touch,T's Restaurant Tantanmen,Cup,Japan,3.75,
2579,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Pack,Taiwan,1,
2578,Nissin,Cup Noodles Chicken Vegetable,Cup,USA,2.25,
2577,Wei Lih,GGE Ramen Snack Tomato Flavor,Pack,Taiwan,2.75,
2576,Ching's Secret,Singapore Curry,Pack,India,3.75,
...,...,...,...,...,...,...
5,Vifon,"Hu Tiu Nam Vang [""Phnom Penh"" style] Asian Sty...",Bowl,Vietnam,3.5,
4,Wai Wai,Oriental Style Instant Noodles,Pack,Thailand,1,
3,Wai Wai,Tom Yum Shrimp,Pack,Thailand,2,
2,Wai Wai,Tom Yum Chili Flavor,Pack,Thailand,2,


In [31]:
df['Stars'] = pd.to_numeric(df['Stars'], errors='coerce')
df['Stars'] = df['Stars'].fillna(0)

In [32]:
X=df.drop('Stars', axis=1)
y=df['Stars']
X_train,X_test,y_train,y_test =train_test_split(X,y, test_size=0.2, random_state=42)

In [33]:
X_train = X_train.drop('Top Ten', axis=1, errors='ignore') 
X_test = X_test.drop('Top Ten', axis=1, errors='ignore')

In [34]:
X_train

Unnamed: 0_level_0,Brand,Variety,Style,Country
Review #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
870,SuperMi,Mi Keriting Rasa Ayam Bawang,Pack,Indonesia
530,Mee Jang,Artificial Shrimp Tom yum,Bowl,Thailand
216,Maruchan,Instant Lunch Roast Chicken Flavor,Cup,USA
2219,Fantastic,Noodles Crispy Bacon Flavour,Cup,Australia
557,Batchelors,Super Noodles Souther Fried Chicken,Pack,UK
...,...,...,...,...
942,Kamfen,E-men Lobster Soup,Pack,China
1485,Nongshim,Spicy Tonkotsu Noodle Soup,Pack,USA
1450,Nissin,Japanese Ramen Tokyo Shoyu Instant Noodles Wit...,Pack,Singapore
1286,Batchelors,Super Noodles Roast Beef & Onion Flavour,Pack,UK


In [35]:
style_imputer = SimpleImputer(strategy='most_frequent')
X_train.loc[:, 'Style'] = style_imputer.fit_transform(X_train[['Style']]).ravel() 
X_test.loc[:, 'Style'] = style_imputer.transform(X_test[['Style']]).ravel()

In [36]:
print(X_train.isnull().sum())

Brand      0
Variety    0
Style      0
Country    0
dtype: int64


In [37]:
categorical_cols = ['Brand', 'Style', 'Country'] 


In [38]:
tfidf_vec=TfidfVectorizer(max_features=100)
X_train_tfidf=tfidf_vec.fit_transform(X_train['Variety'])
X_test_tfidf = tfidf_vec.transform(X_test['Variety'])

In [39]:
cat_encoder = CatBoostEncoder(cols=categorical_cols)
X_train_cat=cat_encoder.fit_transform(X_train[categorical_cols],y_train)
X_test_cat=cat_encoder.transform(X_test[categorical_cols])

In [40]:
scaler=StandardScaler()
X_train_catsc=scaler.fit_transform(X_train_cat)
X_test_catsc=scaler.transform(X_test_cat)

In [41]:
X_train_res = np.hstack([X_train_catsc, X_train_tfidf.toarray()]) 
X_test_res = np.hstack([X_test_catsc, X_test_tfidf.toarray()])

In [42]:
kmns=KMeans(n_clusters=5, random_state=42)
X_train_clus=kmns.fit_predict(X_train_res).reshape(-1,1)
X_test_clus=kmns.predict(X_test_res).reshape(-1,1)


In [43]:
X_train_res = np.hstack([X_train_res, X_train_clus]) 
X_test_res = np.hstack([X_test_res, X_test_clus])

In [44]:
final_model=RandomForestRegressor(random_state=42)
final_model.fit(X_train_res, y_train)

In [45]:
y_pred = final_model.predict(X_test_res) 
print( r2_score(y_test, y_pred))

0.23567604227535732


In [46]:
print(mean_squared_error(y_test, y_pred) ** 0.5)

0.8333840968065551
