In [None]:
import pandas as pd
# Load the master CSV; the path is resolved relative to the working directory.
df = pd.read_csv("yelp_ml_master.csv")

In [None]:
import numpy as np
import ast
# Turn stringified dicts in 'hours' into real dicts.  A value that is not a
# string wrapped in curly braces is passed through unchanged.
df['hours'] = df['hours'].apply(
    lambda raw: ast.literal_eval(raw)
    if isinstance(raw, str) and raw.startswith('{') and raw.endswith('}')
    else raw
)

In [None]:
# Turn stringified dicts in 'attributes' into real dicts.  A value that is not
# a string wrapped in curly braces is passed through unchanged.
df['attributes'] = df['attributes'].apply(
    lambda raw: ast.literal_eval(raw)
    if isinstance(raw, str) and raw.startswith('{') and raw.endswith('}')
    else raw
)

In [None]:
def time_to_minutes(time_str):
    """Convert an ``'H:M'`` clock string into a number of minutes.

    Args:
        time_str: Text holding exactly two integer fields separated by ``':'``.

    Returns:
        int: ``hours * 60 + minutes``.

    Raises:
        ValueError: If there are not exactly two fields, or a field is not
            an integer.
    """
    hour_part, minute_part = time_str.split(':')
    return int(hour_part) * 60 + int(minute_part)


In [None]:
def get_average_time(row):
    """Return the mean daily opening duration, in minutes, for one row.

    ``row['hours']`` is expected to be a dict whose values are
    ``'open-close'`` strings, each side being accepted by ``time_to_minutes``.
    A closing time earlier than the opening time is treated as running past
    midnight.  Days whose duration comes out as zero are ignored.

    Args:
        row: Mapping-like row with an ``'hours'`` entry.

    Returns:
        float: Mean of the positive daily durations, or ``np.nan`` when
        ``hours`` is not a dict or no day has a positive duration.
    """
    hours = row['hours']
    # A value that was never parsed into a dict (e.g. a leftover string) has
    # no .items(); report it as missing instead of raising AttributeError.
    if not isinstance(hours, dict):
        return np.nan

    minutes_per_day = 24 * 60
    durations = []
    for span in hours.values():
        parts = span.split('-')
        duration = time_to_minutes(parts[1]) - time_to_minutes(parts[0])
        if duration < 0:
            # Closing time falls on the next day.
            duration += minutes_per_day
        if duration > 0:
            durations.append(duration)

    # np.mean([]) would emit a RuntimeWarning before returning nan; be explicit.
    return np.mean(durations) if durations else np.nan

In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna()

In [None]:
# Row-wise apply: get_average_time reduces each row's 'hours' entry to one number.
df['mean_time'] = df.apply(get_average_time, axis=1)

In [None]:
# Number of rows sharing each 'name', stored in a 'count' column.
brand_counts = df.groupby('name').size().reset_index(name='count')
# Display in ascending order of count; brand_counts itself stays unsorted.
brand_counts.sort_values(by='count')

In [None]:
# Attach the per-name row count to every row, matching on 'name'.
# NOTE: running this cell a second time merges 'count' in again, and pandas
# then suffixes the clashing columns (count_x / count_y).
df = df.merge(brand_counts, on = 'name')

In [None]:
import pandas as pd
import ast

# Assuming 'df' is your DataFrame and it has a 'stars_y' column for ratings

# Define a function to safely convert stringified lists to actual lists
def string_to_list(string_list):
    """Convert a stringified list into a real list.

    Args:
        string_list: Usually text such as ``"['Pizza', 'Italian']"``.  A value
            that is already a list is returned untouched, so applying the
            function a second time does not wipe the data.

    Returns:
        list: The value parsed by ``ast.literal_eval``; ``[]`` when parsing
        raises ``ValueError``; or, when parsing raises ``SyntaxError``, the
        flat list of items obtained by stripping the surrounding brackets,
        removing single quotes and splitting on ``", "``.
    """
    if isinstance(string_list, list):
        # literal_eval rejects non-string input with ValueError, which would
        # otherwise turn an already-parsed list into [].
        return string_list
    try:
        return ast.literal_eval(string_list)
    except ValueError:
        # In case of an error, return an empty list
        return []
    except SyntaxError:
        # Handle strings that are not valid literals (e.g. quotes missing).
        # The split result is already a list; wrapping it in another list
        # would make explode() yield one unhashable list per row.
        return string_list.strip("[]").replace("'", "").split(", ")

# Replace every entry of 'categories_list' with the result of string_to_list
df['categories_list'] = df['categories_list'].apply(string_to_list)

In [None]:

# Spread list entries of 'categories_list' over separate rows, so each output
# row carries a single category
exploded_df = df.explode('categories_list')

# Per category: mean of 'stars_y' and the number of non-null 'stars_y' values
category_stats = exploded_df.groupby('categories_list')['stars_y'].agg(['mean', 'count'])

# Move the category labels out of the index into an ordinary column
category_stats_df = category_stats.reset_index()

# Rename columns for clarity
category_stats_df.columns = ['Category', 'AverageRating', 'Count']

In [None]:
# Drop the generic 'Restaurants' and 'Food' labels.  Surrounding whitespace is
# stripped before comparing, which also covers the tab-suffixed 'Restaurants'.
generic_labels = ['Restaurants', 'Food']
is_generic = category_stats_df['Category'].str.strip().isin(generic_labels)
category_stats_df = category_stats_df[~is_generic]

In [None]:
# Keep the 32 rows with the largest 'Count', ordered from smallest to largest.
category_stats_df = category_stats_df.sort_values(by='Count').tail(32)

In [None]:
# Category labels that will be one-hot encoded, in table order.
selected_category = category_stats_df['Category'].tolist()

In [None]:
# For every attribute key, collect the set of distinct values seen in the data.
attribute_set = {}

for attribute in df.attributes:
    # Only parsed dicts have key/value pairs to collect; None or a string that
    # was left unparsed is skipped instead of failing on .items().
    if not isinstance(attribute, dict):
        continue
    for key, value in attribute.items():
        attribute_set.setdefault(key, set()).add(value)

# Keep attributes that take at most three distinct values, one of which is the
# string 'True'.
selected_attributes = [
    key
    for key, value in attribute_set.items()
    if len(value) <= 3 and 'True' in value
]

In [None]:
# Display the selected attribute names
selected_attributes 

In [None]:
import pandas as pd

# Function to process a single row's attribute dictionary
def process_attributes(attr_dict, attributes=None):
    """Encode one row's attribute dict as a list of 0/1 flags.

    Args:
        attr_dict: Mapping of attribute name to value.  Anything that is not a
            dict (``None``, NaN, an unparsed string) yields all zeros.
        attributes: Attribute names to encode, in output order.  Defaults to
            the module-level ``selected_attributes``.

    Returns:
        list[int]: One entry per attribute: 1 when the attribute's value is
        the string ``'True'``, otherwise 0 (including when it is absent).
    """
    if attributes is None:
        attributes = selected_attributes
    if not isinstance(attr_dict, dict):
        # A string here would turn `attr in attr_dict` into a substring test
        # and then fail on indexing, so only real dicts are inspected.
        return [0] * len(attributes)
    return [1 if attr_dict.get(attr) == 'True' else 0 for attr in attributes]

# Encode every row's 'attributes' entry; each cell of the new column holds a list
df['attribute_array'] = df['attributes'].apply(process_attributes)

# Now df['attribute_array'] contains the desired arrays for each row

In [None]:
import pandas as pd

# process_attributes and df['attribute_array'] are defined in the preceding
# cell.  The verbatim second copy of that definition and assignment that used
# to live here was removed so the notebook has a single definition.

In [None]:
def process_category(c_set, categories=None):
    """Encode one row's categories as a list of 0/1 flags.

    Args:
        c_set: Collection of category labels for one row, or ``None``.
        categories: Category labels to encode, in output order.  Defaults to
            the module-level ``selected_category``.

    Returns:
        list[int]: One entry per category: 1 when that label is ``in``
        ``c_set``, otherwise 0.  All zeros when ``c_set`` is ``None``.
    """
    if categories is None:
        categories = selected_category
    if c_set is None:
        return [0] * len(categories)
    return [1 if category in c_set else 0 for category in categories]

# Encode every row's 'categories_list' entry; each cell of the new column holds a list
df['category_array'] = df['categories_list'].apply(process_category)

In [None]:
# Display the column names currently present in df
df.columns

In [None]:
# Rename 'count' and 'density' to the names used in the feature list.
df = df.rename(columns={'count': 'brand_size', 'density': 'density_state'})

In [None]:
# Columns carried into the modelling table.
cols = [
    'stars_y',
    'stars_x',
    'population_postal',
    'density_postal',
    'population_city',
    'density_city',
    'population_state',
    'density_state',
    'mean_time',
    'brand_size',
    'category_counts',
    'category_array',
    'attribute_array',
]

In [None]:
# Keep only the columns listed in cols
df_ml = df[cols]

In [None]:
# Drop rows with a missing value in any of the selected columns
df_ml = df_ml.dropna()

In [None]:
# Parse stringified lists (e.g. "[3, 1]") into lists of ints; values that are
# not bracketed strings pass through unchanged.
# The column list gets its own name so the feature list `cols` is not
# overwritten -- otherwise re-running `df_ml = df[cols]` would silently select
# only 'category_counts'.
list_cols = ['category_counts']
for col in list_cols:
    df_ml[col] = df_ml[col].apply(lambda x: [int(i) for i in ast.literal_eval(x)] if isinstance(x, str) and x.startswith('[') and x.endswith(']') else x)

In [None]:
# Summarise each list-valued column with five per-row statistics, stored as
# '<column>_<stat>'.  Empty lists give None.
# NOTE: pandas' Series.std defaults to ddof=1, so a one-element list gives NaN.
col_names = ['category_counts']
stat_names = ['mean', 'std', 'median', 'max', 'min']

for col_name in col_names:
    for stat in stat_names:
        df_ml[col_name + '_' + stat] = df_ml[col_name].apply(
            # stat is bound as a default so each lambda keeps its own statistic
            lambda counts, stat=stat: getattr(pd.Series(counts), stat)() if len(counts) > 0 else None
        )

In [None]:
# The raw list column is no longer needed once its summary statistics exist
df_ml = df_ml.drop(['category_counts'], axis=1)

In [None]:
# Drop rows where any remaining column, including the new statistics, is missing
df_ml = df_ml.dropna()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

# Numeric feature columns: everything except the two rating columns and the two
# list-valued columns, which are expanded into 0/1 matrices below.
X = df_ml.drop(['stars_x','stars_y', 'attribute_array', 'category_array'], axis=1)
one_hot_attribute = np.array(df_ml['attribute_array'].tolist())
one_hot_category = np.array(df_ml['category_array'].tolist())

# Column order: attribute flags, category flags, numeric columns.  This has to
# match the order in which the feature labels for the coefficient table are
# assembled (attribute names + category names + numeric column names);
# stacking the category block first would pair coefficients with wrong labels.
X = np.concatenate([one_hot_attribute, one_hot_category, X], axis=1)

y = df_ml['stars_y']

# Split first, then fit the scaler on the training rows only, so that the
# test rows do not influence the scaling statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full matrix; kept because later cells refer to X_scaled.
X_scaled = scaler.transform(X)

# Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict and evaluate
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error: {mse}")


In [None]:
# Names of the numeric feature columns (ratings and list-valued columns excluded)
arr_name = list(df_ml.drop(['stars_x','stars_y', 'attribute_array', 'category_array'], axis=1).columns)

In [None]:
# Prefix every name with its feature group so the labels are self-describing.
selected_attributes_p = ["[Attribute] " + name for name in selected_attributes]
selected_category_p = ["[Category] " + name for name in selected_category]

In [None]:
# Full list of feature labels: attribute flags, category flags, numeric columns.
# The numeric names are read from df_ml rather than from the previous value of
# arr_name, so re-running this cell cannot prepend the prefixed names twice.
numeric_feature_names = list(
    df_ml.drop(['stars_x', 'stars_y', 'attribute_array', 'category_array'], axis=1).columns
)
arr_name = selected_attributes_p + selected_category_p + numeric_feature_names
model.coef_

In [None]:
# Pair each feature label with the coefficient at the same position.  Both
# sequences must have the same length and the same ordering as the columns
# the model was fitted on.
df_coefficients = pd.DataFrame({
    'Feature': arr_name,
    'Coefficient': model.coef_
})


In [None]:
# Display coefficients in ascending order; df_coefficients itself is not reordered
df_coefficients.sort_values('Coefficient')

In [None]:
# Save the coefficient table; the DataFrame index is written as the first column
df_coefficients.to_csv("ml_coeff.csv")

In [None]:
# Display the feature labels
arr_name

In [None]:
# Sanity check: the column count of X should equal the number of feature labels
X.shape, len(arr_name)

In [None]:
# Number of feature labels
len(arr_name)

In [None]:
# A bare len() call raises TypeError and stops a full notebook run.
# Presumably this was meant as a cross-check against len(arr_name) -- confirm.
len(model.coef_)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# Numeric feature columns: everything except the two rating columns and the two
# list-valued columns, which are expanded into 0/1 matrices below.
X = df_ml.drop(['stars_x','stars_y', 'attribute_array', 'category_array'], axis=1)
one_hot_attribute = np.array(df_ml['attribute_array'].tolist())
one_hot_category = np.array(df_ml['category_array'].tolist())

# 'mean_time' and 'brand_size' get a degree-5 expansion; every other numeric
# column gets a degree-2 expansion.
columns_degree_5 = [ 'mean_time', 'brand_size']
columns_degree_2 = [col for col in X.columns if col not in columns_degree_5]

# Creating polynomial features for degree 5
poly_5 = PolynomialFeatures(degree=5)
X_poly_5 = poly_5.fit_transform(df_ml[columns_degree_5])

# Creating polynomial features for degree 2
poly_2 = PolynomialFeatures(degree=2)
X_poly_2 = poly_2.fit_transform(df_ml[columns_degree_2])

# Combine the polynomial features
X_poly_combined = np.concatenate([X_poly_5, X_poly_2], axis=1)

# Concatenate with one_hot_encoded features
X_final = np.concatenate([one_hot_attribute, one_hot_category, X_poly_combined], axis=1)

y = df_ml['stars_y']

# Split first, then fit the scaler on the training rows only, so that the
# test rows do not influence the scaling statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full matrix; kept because later cells refer to X_scaled.
X_scaled = scaler.transform(X_final)

# Train the model and make predictions
model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Evaluate
mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error: {mse}")


In [None]:
# Repeat the split with the same test_size and random_state, this time on
# 'stars_x'; only the two label outputs are kept.  Intended to line up
# row-for-row with X_train / X_test.
_, _, y_train_real, y_test_real = train_test_split(X_scaled, df_ml['stars_x'], test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import Lasso

# L1-regularised linear model; alpha sets the penalty strength and can be tuned.
lasso_model = Lasso(alpha=0.01).fit(X_train, y_train)

# Score on the held-out rows (the error uses the unrounded predictions).
lasso_predictions = lasso_model.predict(X_test)
rounded_lasso_predictions = np.array(list(map(round, lasso_predictions)))
mse = mean_squared_error(y_test, lasso_predictions)
print(f"Mean Squared Error with L1 Regularization: {mse}")

# Fitted coefficients, one per feature column.
lasso_coefficients = lasso_model.coef_
print(lasso_coefficients)


In [None]:
from sklearn.linear_model import Ridge

# L2-regularised linear model; alpha sets the penalty strength and can be tuned.
ridge_model = Ridge(alpha=0.00001).fit(X_train, y_train)

# Score on the held-out rows (the error uses the unrounded predictions).
ridge_predictions = ridge_model.predict(X_test)
rounded_ridge_predictions = np.array(list(map(round, ridge_predictions)))
mse = mean_squared_error(y_test, ridge_predictions)
print(f"Mean Squared Error with L2 Regularization: {mse}")

# Fitted coefficients, one per feature column.
ridge_coefficients = ridge_model.coef_
print(ridge_coefficients)

In [None]:
from sklearn.linear_model import RidgeCV

# Candidate penalty strengths: 100 log-spaced values from 1e-4 to 1e4
alpha_values = np.logspace(-4, 4, 100)

# scikit-learn 1.5 renamed store_cv_values / cv_values_ to store_cv_results /
# cv_results_, and later releases dropped the old spelling.  Try the new
# keyword first and fall back to the old one on releases that predate it.
try:
    ridge_cv = RidgeCV(alphas=alpha_values, store_cv_results=True)
except TypeError:
    ridge_cv = RidgeCV(alphas=alpha_values, store_cv_values=True)

# Fit the model
ridge_cv.fit(X_train, y_train)

# Best lambda
best_lambda = ridge_cv.alpha_
print(f"Best lambda: {best_lambda}")

# Per-alpha mean of the stored cross-validation errors (averaged over axis 0)
cv_errors = ridge_cv.cv_results_ if hasattr(ridge_cv, 'cv_results_') else ridge_cv.cv_values_
mse_values = np.mean(cv_errors, axis=0)


In [None]:
def round_to_nearest_half(number):
    """Round to the nearest multiple of 0.5, with exact ties going up.

    The built-in ``round`` sends ties to the nearest even integer, so
    ``round(number * 2) / 2`` maps 2.25 to 2.0 but 2.75 to 3.0.  Flooring
    ``number * 2 + 0.5`` treats every tie the same way.

    Args:
        number: Real number to round.

    Returns:
        float: The closest multiple of 0.5; a value exactly halfway between
        two multiples goes to the larger one.
    """
    import math

    return math.floor(number * 2 + 0.5) / 2

# Round each Ridge prediction with round_to_nearest_half, then measure how
# often the result equals the held-out 'stars_x' value exactly.
rounded_predictions = np.array(list(map(round_to_nearest_half, ridge_predictions)))
correct_predictions = y_test_real == rounded_predictions
correctness_ratio = correct_predictions.mean()
print(f"Correctness Ratio: {correctness_ratio}")

In [None]:
# Display the per-alpha mean cross-validation errors from RidgeCV
mse_values 

In [None]:
# Save the modelling table; list-valued cells are written in their string form
df_ml.to_csv("final_ml_data.csv")

In [None]:
import matplotlib.pyplot as plt

# Predicted (x axis) against actual 'stars_y' (y axis) on the test split, using
# the predictions of the most recently fitted `model`.
plt.scatter(predictions, y_test, s= 5, alpha=0.3)

# Diagonal reference line where prediction equals truth.  The end points are
# passed inline instead of through variables named x / y, which would
# overwrite the target `y` used by the modelling cells.
plt.plot([0, 5], [0, 5], color = "red")

plt.ylabel('Real Avg.Rating')
plt.xlabel('Predicted Avg.Rating')

plt.grid(True, linestyle='--', linewidth=0.5, color='gray')

plt.ylim(0,5)
plt.xlim(0,5)

In [None]:
# Largest Ridge prediction on the test split
np.max(ridge_predictions)

In [None]:
import matplotlib.pyplot as plt

# Held-out 'stars_x' (x axis) against the Ridge predictions (y axis)
plt.scatter(y_test_real,ridge_predictions, s= 5, alpha=0.1)
plt.ylim(0,5)

In [None]:
# Save the per-name row counts; the DataFrame index is written as the first column
brand_counts.to_csv("yelp_brands.csv")

In [None]:

# Share of test rows whose rounded Ridge prediction lies within 0.5 of the
# held-out 'stars_x' value.  The comparison uses rounded_predictions from the
# line below it is computed on; `predictions` belongs to the plain
# LinearRegression model and would score a different model.
rounded_predictions = np.array([round_to_nearest_half(p) for p in ridge_predictions])
correct_predictions = abs(y_test_real - rounded_predictions) <= 0.5
correctness_ratio = correct_predictions.mean()
print(f"Correctness Ratio: {correctness_ratio}")

In [None]:
# Display the final modelling table
df_ml