# Загрузка Pandas и очистка данных

In [1]:
import pandas as pd
import numpy as np # для работы с массивами
import re
from datetime import datetime

In [2]:
df = pd.read_csv('main_task.csv')

In [3]:
# Ваш код по очистке данных и генерации новых признаков
# При необходимости добавьте ячейки

In [4]:
df.head()

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA
0,id_5569,Paris,"['European', 'French', 'International']",5570.0,3.5,$$ - $$$,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643
1,id_1535,Stockholm,,1537.0,4.0,,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032
2,id_352,London,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,$$$$,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781
3,id_3456,Berlin,,3458.0,5.0,,3.0,"[[], []]",/Restaurant_Review-g187323-d1358776-Reviews-Es...,d1358776
4,id_615,Munich,"['German', 'Central European', 'Vegetarian Fri...",621.0,4.0,$$ - $$$,84.0,"[['Best place to try a Bavarian food', 'Nice b...",/Restaurant_Review-g187309-d6864963-Reviews-Au...,d6864963


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Restaurant_id      40000 non-null  object 
 1   City               40000 non-null  object 
 2   Cuisine Style      30717 non-null  object 
 3   Ranking            40000 non-null  float64
 4   Rating             40000 non-null  float64
 5   Price Range        26114 non-null  object 
 6   Number of Reviews  37457 non-null  float64
 7   Reviews            40000 non-null  object 
 8   URL_TA             40000 non-null  object 
 9   ID_TA              40000 non-null  object 
dtypes: float64(3), object(7)
memory usage: 3.1+ MB


# Clearing and create new fiature for Price Range

In [6]:
df['Price Range'] = df['Price Range'].fillna(df['Price Range'].value_counts().index[0])

In [7]:
price_range_dict = {'$$ - $$$' : 0, '$' : 1, '$$$$': 2}

In [8]:
df['Price Range'] = df['Price Range'].replace(to_replace=price_range_dict)

In [9]:
df = df.astype({'Price Range': np.float64})

In [10]:
df['Number of Reviews'] = df['Number of Reviews'].fillna(df['Number of Reviews'].mean())

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Restaurant_id      40000 non-null  object 
 1   City               40000 non-null  object 
 2   Cuisine Style      30717 non-null  object 
 3   Ranking            40000 non-null  float64
 4   Rating             40000 non-null  float64
 5   Price Range        40000 non-null  float64
 6   Number of Reviews  40000 non-null  float64
 7   Reviews            40000 non-null  object 
 8   URL_TA             40000 non-null  object 
 9   ID_TA              40000 non-null  object 
dtypes: float64(4), object(6)
memory usage: 3.1+ MB


# Clearing and create new fiature for Cuisine Style

In [12]:
def transform_str_to_list(cell):
    if cell is not np.nan:
        if isinstance(cell, str):
            return cell[1:-1].replace('\'', '').split(',')
    else:
        return []

In [13]:
def unique_cuisine_style(data):
    cuisine = set()
    for cell in data:
        for item in cell:
            cuisine.add(item.strip())
    return cuisine

In [14]:
def find_item(cell):
    if item in cell:
        return 1
    return 0

In [15]:
df['Cuisine Style'] = df.apply(lambda row: transform_str_to_list(row['Cuisine Style']), axis = 1)

In [16]:
unique_cuisine = unique_cuisine_style(df['Cuisine Style'])

In [17]:
for item in unique_cuisine:
    df[item] = df['Cuisine Style'].apply(find_item)

In [18]:
df['Cuisine Style'] = df['Cuisine Style'].apply(lambda x: len(x))

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Columns: 135 entries, Restaurant_id to Korean
dtypes: float64(4), int64(126), object(5)
memory usage: 41.2+ MB


# Create delta days

In [20]:
def delta_days(cell):
    delta = 0
    match = re.findall(r'\d{2}\/\d{2}\/\d{4}', cell)
    if isinstance(match, list) and len(match) > 1:
        day1 = datetime.strptime(match[0], '%m/%d/%Y').date()
        day2 = datetime.strptime(match[1], '%m/%d/%Y').date()
        delta = (day1-day2).days
    return delta

In [21]:
df['Delta Days Review'] = df.apply(lambda row: delta_days(row['Reviews']), axis = 1)

# Create Ranking Review

In [22]:
def ranking_to_review(ranking, review):
    return ranking/review

In [23]:
df['Ranking Review'] = df.apply(lambda row: ranking_to_review(row['Ranking'], row['Number of Reviews']), axis = 1)

# Разбиваем датафрейм на части, необходимые для обучения и тестирования модели

In [24]:
# Х - данные с информацией о ресторанах, у - целевая переменная (рейтинги ресторанов)
X = df.drop(['Restaurant_id', 'Rating'], axis = 1)
y = df['Rating']

In [25]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Columns: 135 entries, City to Ranking Review
dtypes: float64(4), int64(127), object(4)
memory usage: 41.2+ MB


In [26]:
X.drop(['City', 'Reviews', 'URL_TA', 'ID_TA'], inplace=True, axis=1, errors='ignore')

In [27]:
# Загружаем специальный инструмент для разбивки:
from sklearn.model_selection import train_test_split

In [28]:
# Наборы данных с меткой "train" будут использоваться для обучения модели, "test" - для тестирования.
# Для тестирования мы будем использовать 25% от исходного датасета.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [29]:
X

Unnamed: 0,Cuisine Style,Ranking,Price Range,Number of Reviews,Middle Eastern,Albanian,Xinjiang,Grill,Minority Chinese,Fujian,...,Sri Lankan,International,Mongolian,Indonesian,Swedish,Vegan Options,Southwestern,Korean,Delta Days Review,Ranking Review
0,3,5570.0,0.0,194.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,41,28.711340
1,0,1537.0,0.0,10.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,382,153.700000
2,7,353.0,2.0,688.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0.513081
3,0,3458.0,0.0,3.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1152.666667
4,3,621.0,0.0,84.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,272,7.392857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,4,500.0,0.0,79.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,34,6.329114
39996,5,6341.0,0.0,542.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,9,11.699262
39997,2,1652.0,0.0,4.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3127,413.000000
39998,5,641.0,0.0,70.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,23,9.157143


In [30]:
y

0        3.5
1        4.0
2        4.5
3        5.0
4        4.0
        ... 
39995    4.5
39996    3.5
39997    4.5
39998    4.0
39999    3.0
Name: Rating, Length: 40000, dtype: float64

# Создаём, обучаем и тестируем модель

In [31]:
# Импортируем необходимые библиотеки:
from sklearn.ensemble import RandomForestRegressor # инструмент для создания и обучения модели
from sklearn import metrics # инструменты для оценки точности модели

In [32]:
# Создаём модель
regr = RandomForestRegressor(n_estimators=100)

# Обучаем модель на тестовом наборе данных
regr.fit(X_train, y_train)

# Используем обученную модель для предсказания рейтинга ресторанов в тестовой выборке.
# Предсказанные значения записываем в переменную y_pred
y_pred = regr.predict(X_test)

In [33]:
# Сравниваем предсказанные значения (y_pred) с реальными (y_test), и смотрим насколько они в среднем отличаются
# Метрика называется Mean Absolute Error (MAE) и показывает среднее отклонение предсказанных значений от фактических.
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))

MAE: 0.37165092499999997


In [34]:
df.to_csv('submission.csv', index = False)