In [None]:
# Data wrangling and Ridge regression.
#
# Reads train.csv / validation.csv, prints a few exploratory summaries, imputes
# missing values, ordinal-encodes the categorical features, fits a Ridge model
# on the training rows with 'Reviews' as the target, and prints the validation
# predictions next to each 'Place.name'.

import numpy as np
import pandas as pd
import sklearn
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OrdinalEncoder

df_train = pd.read_csv('train.csv')
print(df_train.head())
df_val = pd.read_csv('validation.csv')
print(df_val.head())

print(df_train.shape)
print(df_val.shape)

print(df_train.columns)
print(df_val.columns)

# Cardinality and missing-value counts per column, before imputation.
print(df_train.nunique().sort_values())
print(df_train.isnull().sum().sort_values())

print(df_val.nunique().sort_values())
print(df_val.isnull().sum().sort_values())

# Impute via plain column assignment. `df[col].fillna(..., inplace=True)` is a
# chained assignment: it is deprecated and does not update the frame when
# pandas Copy-on-Write is enabled.
train_rating_median = df_train['Rating'].median()
df_train['Rating'] = df_train['Rating'].fillna(train_rating_median)
# NOTE(review): the target column is median-imputed here rather than dropping
# rows with a missing target -- confirm this is intended.
df_train['Reviews'] = df_train['Reviews'].fillna(df_train['Reviews'].median())
df_train['Takeout.option'] = df_train['Takeout.option'].fillna(df_train['Takeout.option'].mode()[0])
df_train['Dine.in.option'] = df_train['Dine.in.option'].fillna(df_train['Dine.in.option'].mode()[0])

# 'Rating' is a model feature, so validation gaps are filled with the statistic
# learned from the training data; both sets then share one preprocessing rule.
df_val['Rating'] = df_val['Rating'].fillna(train_rating_median)
# These two columns are dropped below; they are filled only so the null
# summary printed next is clean.
df_val['Takeout.option'] = df_val['Takeout.option'].fillna(df_val['Takeout.option'].mode()[0])
df_val['Dine.in.option'] = df_val['Dine.in.option'].fillna(df_val['Dine.in.option'].mode()[0])

print(df_train.isnull().sum().sort_values())
print(df_val.isnull().sum().sort_values())

print(df_train['Place.name'])

# Remove the unused columns (and the target, for the training features).
df_ft = df_train.drop(['Dine.in.option', 'Takeout.option', 'Place.name', 'Reviews'], axis = 1)
df_val = df_val.drop(['Dine.in.option', 'Takeout.option', 'Place.name'], axis = 1)

print(df_ft.shape)
print(df_ft.head())

print(df_val.shape)
print(df_val.head())

df_cf = df_ft[['Delivery.option', 'Price', 'Place.type', 'Region']]
df_val_cf = df_val[['Delivery.option', 'Price', 'Place.type', 'Region']]

# Fit the encoder on the training categories only. Categories that appear only
# in the validation set are encoded as -1 instead of raising.
oe = OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = -1)
cf_encoded = oe.fit_transform(df_cf)
print(cf_encoded)

rating = df_ft[['Rating']].to_numpy()
print(rating)

# transform(), not fit_transform(): refitting on the validation frame would
# assign integer codes independently of the training mapping, so the same
# category could get a different code than the one the model was trained on.
cf_val_encoded = oe.transform(df_val_cf)
val_rating = df_val[['Rating']].to_numpy()

# Feature matrix: encoded categoricals followed by the numeric rating column.
X_train = np.concatenate([cf_encoded, rating], axis = 1)
print(X_train.shape)

y_train = df_train[['Reviews']].to_numpy()
print(y_train.shape)

X_val = np.concatenate([cf_val_encoded, val_rating], axis = 1)
print(X_val.shape)

ridge = Ridge(alpha = 0.5)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_val)

# NOTE(review): the model is trained on 'Reviews', yet the prediction column is
# labelled 'rating'. The label is kept unchanged here -- confirm what is expected.
y_pre = pd.DataFrame(y_pred, columns = ['rating'])

# Re-read the validation file to recover 'Place.name', which was dropped above.
df_reval = pd.read_csv('validation.csv')
base_result = pd.concat([df_reval[['Place.name']], y_pre], axis = 1)
print(base_result)


In [None]:
# Data wrangling and Lasso regression.
#
# Reads train.csv / validation.csv, prints a few exploratory summaries, imputes
# missing values, ordinal-encodes the categorical features, fits a Lasso model
# on the training rows with 'Reviews' as the target, and prints the validation
# predictions next to each 'Place.name'.

import numpy as np
import pandas as pd
import sklearn
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OrdinalEncoder

df_train = pd.read_csv('train.csv')
print(df_train.head())
df_val = pd.read_csv('validation.csv')
print(df_val.head())

print(df_train.shape)
print(df_val.shape)

print(df_train.columns)
print(df_val.columns)

# Cardinality and missing-value counts per column, before imputation.
print(df_train.nunique().sort_values())
print(df_train.isnull().sum().sort_values())

print(df_val.nunique().sort_values())
print(df_val.isnull().sum().sort_values())

# Impute via plain column assignment. `df[col].fillna(..., inplace=True)` is a
# chained assignment: it is deprecated and does not update the frame when
# pandas Copy-on-Write is enabled.
train_rating_median = df_train['Rating'].median()
df_train['Rating'] = df_train['Rating'].fillna(train_rating_median)
# NOTE(review): the target column is median-imputed here rather than dropping
# rows with a missing target -- confirm this is intended.
df_train['Reviews'] = df_train['Reviews'].fillna(df_train['Reviews'].median())
df_train['Takeout.option'] = df_train['Takeout.option'].fillna(df_train['Takeout.option'].mode()[0])
df_train['Dine.in.option'] = df_train['Dine.in.option'].fillna(df_train['Dine.in.option'].mode()[0])

# 'Rating' is a model feature, so validation gaps are filled with the statistic
# learned from the training data; both sets then share one preprocessing rule.
df_val['Rating'] = df_val['Rating'].fillna(train_rating_median)
# These two columns are dropped below; they are filled only so the null
# summary printed next is clean.
df_val['Takeout.option'] = df_val['Takeout.option'].fillna(df_val['Takeout.option'].mode()[0])
df_val['Dine.in.option'] = df_val['Dine.in.option'].fillna(df_val['Dine.in.option'].mode()[0])

print(df_train.isnull().sum().sort_values())
print(df_val.isnull().sum().sort_values())

print(df_train['Place.name'])

# Remove the unused columns (and the target, for the training features).
df_ft = df_train.drop(['Dine.in.option', 'Takeout.option', 'Place.name', 'Reviews'], axis = 1)
df_val = df_val.drop(['Dine.in.option', 'Takeout.option', 'Place.name'], axis = 1)

print(df_ft.shape)
print(df_ft.head())

print(df_val.shape)
print(df_val.head())

df_cf = df_ft[['Delivery.option', 'Price', 'Place.type', 'Region']]
df_val_cf = df_val[['Delivery.option', 'Price', 'Place.type', 'Region']]

# Fit the encoder on the training categories only. Categories that appear only
# in the validation set are encoded as -1 instead of raising.
oe = OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = -1)
cf_encoded = oe.fit_transform(df_cf)
print(cf_encoded)

rating = df_ft[['Rating']].to_numpy()
print(rating)

# transform(), not fit_transform(): refitting on the validation frame would
# assign integer codes independently of the training mapping, so the same
# category could get a different code than the one the model was trained on.
cf_val_encoded = oe.transform(df_val_cf)
val_rating = df_val[['Rating']].to_numpy()

# Feature matrix: encoded categoricals followed by the numeric rating column.
X_train = np.concatenate([cf_encoded, rating], axis = 1)
print(X_train.shape)

y_train = df_train[['Reviews']].to_numpy()
print(y_train.shape)

X_val = np.concatenate([cf_val_encoded, val_rating], axis = 1)
print(X_val.shape)

lasso = Lasso(alpha = 0.5)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_val)

# NOTE(review): the model is trained on 'Reviews', yet the prediction column is
# labelled 'rating'. The label is kept unchanged here -- confirm what is expected.
y_pre = pd.DataFrame(y_pred, columns = ['rating'])

# Re-read the validation file to recover 'Place.name', which was dropped above.
df_reval = pd.read_csv('validation.csv')
base_result = pd.concat([df_reval[['Place.name']], y_pre], axis = 1)
print(base_result)


