# Загрузка Pandas и очистка данных

In [192]:
import pandas as pd
import ast
from bs4 import BeautifulSoup
import requests
from datetime import datetime
from collections import namedtuple

In [112]:
# Load the dataset into a dataframe.
# NOTE: the path ends in '.xls', but the file is parsed as CSV by read_csv.
df = pd.read_csv('main_task.xls')

In [201]:
# Helper functions are defined in this cell

def drop_by_type(d, t):
    '''
    Return a new dataframe without the columns of a given dtype.
    The input dataframe is left unmodified.
    Params:
        d - source dataframe
        t - dtype name provided as text (for example, 'float64');
            it is compared against str() of each column's dtype
    Prints the list of dropped column names and returns the
    trimmed dataframe.
    '''
    matching = []
    for col in d.columns:
        if str(d[col].dtype) == t:
            matching.append(col)
    # drop() without inplace returns a new object, so `d` is untouched
    trimmed = d.drop(columns=matching)
    print(f"List of dropped columns (by type {t}): {matching}")
    return trimmed


def from_website(url, timeout=10):
    '''
    Gets additional data about the restaurant from tripadvisor
    website if the proper url is provided.
    Params:
        url - path part of the restaurant page; it is appended
              to "https://www.tripadvisor.com"
        timeout - seconds to wait for the server before giving up
                  (passed to requests.get); without it a stalled
                  connection would block forever
    Returns:
        list of datetime objects parsed from the 'title' attribute
        of every <span class="ratingDate"> element on the page
        (empty list if there are none).
    Raises:
        requests.exceptions.RequestException - on connection
            problems or when the timeout expires
        ValueError - if a title does not match '%B %d, %Y'
    NOTE: the HTTP status code is not checked, so an error page
    simply yields an empty list.
    '''
    r = requests.get(f"https://www.tripadvisor.com{url}", timeout=timeout)
    # Name the parser explicitly: otherwise bs4 picks whichever one is
    # installed, which makes results environment-dependent and emits a warning.
    soup = BeautifulSoup(r.content, 'html.parser')

    # "title": True keeps only spans that actually carry a title attribute,
    # so a span without it cannot raise KeyError below.
    date_spans = soup.find_all('span', {"class": "ratingDate", "title": True})
    # %B is the full month name (e.g. "January") in the current locale.
    return [datetime.strptime(span['title'], '%B %d, %Y') for span in date_spans]

In [113]:
# Display the loaded dataframe (the notebook renders a truncated preview).
df

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA
0,id_5569,Paris,"['European', 'French', 'International']",5570.0,3.5,$$ - $$$,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643
1,id_1535,Stockholm,,1537.0,4.0,,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032
2,id_352,London,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,$$$$,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781
3,id_3456,Berlin,,3458.0,5.0,,3.0,"[[], []]",/Restaurant_Review-g187323-d1358776-Reviews-Es...,d1358776
4,id_615,Munich,"['German', 'Central European', 'Vegetarian Fri...",621.0,4.0,$$ - $$$,84.0,"[['Best place to try a Bavarian food', 'Nice b...",/Restaurant_Review-g187309-d6864963-Reviews-Au...,d6864963
...,...,...,...,...,...,...,...,...,...,...
39995,id_499,Milan,"['Italian', 'Vegetarian Friendly', 'Vegan Opti...",500.0,4.5,$$ - $$$,79.0,"[['The real Italian experience!', 'Wonderful f...",/Restaurant_Review-g187849-d2104414-Reviews-Ro...,d2104414
39996,id_6340,Paris,"['French', 'American', 'Bar', 'European', 'Veg...",6341.0,3.5,$$ - $$$,542.0,"[['Parisian atmosphere', 'Bit pricey but inter...",/Restaurant_Review-g187147-d1800036-Reviews-La...,d1800036
39997,id_1649,Stockholm,"['Japanese', 'Sushi']",1652.0,4.5,,4.0,"[['Good by swedish standards', 'A hidden jewel...",/Restaurant_Review-g189852-d947615-Reviews-Sus...,d947615
39998,id_640,Warsaw,"['Polish', 'European', 'Eastern European', 'Ce...",641.0,4.0,$$ - $$$,70.0,"[['Underground restaurant', 'Oldest Restaurant...",/Restaurant_Review-g274856-d1100838-Reviews-Ho...,d1100838


In [117]:
# Frequency of each price-range category; value_counts() skips NaN by default.
df['Price Range'].value_counts()

$$ - $$$    18412
$            6279
$$$$         1423
Name: Price Range, dtype: int64

In [121]:
# Number of distinct cities present in the dataset.
df['City'].nunique()

31

In [146]:
# Unique cuisine styles and how often each one occurs.
# Each cell of 'Cuisine Style' holds a list written out as a string,
# so ast.literal_eval is used to turn it into a real Python list.
# IMPORTANT: missing values are first replaced by the placeholder
# list "['dummy_style']", which therefore shows up in the counts.
cuisine_styles = (
    df['Cuisine Style']
    .fillna("['dummy_style']")
    .apply(ast.literal_eval)
)
cuisine_styles.explode().value_counts()

Vegetarian Friendly    11189
European               10060
dummy_style             9283
Mediterranean           6277
Italian                 5964
                       ...  
Yunnan                     1
Latvian                    1
Xinjiang                   1
Salvadoran                 1
Burmese                    1
Name: Cuisine Style, Length: 126, dtype: int64

In [147]:
# Mean number of cuisine styles per restaurant
# (rows that had no styles count as 1 because of the placeholder list)
cuisine_styles.map(len).mean()

2.6224

In [105]:
# Keep only the non-object columns (drop_by_type prints what was removed).
df = drop_by_type(df, 'object')

# Summary statistics of the review counts, taken BEFORE missing values
# are filled so that the imputed zeros do not distort them.
avg_num_rev = df['Number of Reviews'].mean()
med_num_rev = df['Number of Reviews'].median()
min_num_rev = df['Number of Reviews'].min()
max_num_rev = df['Number of Reviews'].max()

# Replace missing review counts with 0.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: the chained in-place form is deprecated and does not
# update the dataframe when pandas Copy-on-Write is enabled.
df['Number of Reviews'] = df['Number of Reviews'].fillna(0)
df.info()

List of dropped columns (by type object): ['Restaurant_id', 'City', 'Cuisine Style', 'Price Range', 'Reviews', 'URL_TA', 'ID_TA']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Ranking            40000 non-null  float64
 1   Rating             40000 non-null  float64
 2   Number of Reviews  40000 non-null  float64
dtypes: float64(3)
memory usage: 937.6 KB


In [106]:
# Descriptive statistics of the review counts.
# NOTE(review): in top-to-bottom order this runs after missing values were
# filled with 0, so the zeros are included here — confirm that is intended.
df['Number of Reviews'].describe()

count    40000.000000
mean       116.889700
std        287.729821
min          0.000000
25%          7.000000
50%         28.000000
75%        105.000000
max       9660.000000
Name: Number of Reviews, dtype: float64

In [203]:
# Collect review dates for every restaurant: from_website is called once per
# row, i.e. one HTTP request per restaurant.
# NOTE(review): in top-to-bottom order 'URL_TA' has already been removed by
# drop_by_type(df, 'object') in an earlier cell, so this lookup raises KeyError
# on a fresh Run All; the scraping has to happen before the object columns
# are dropped (and is slow enough that its result should be cached).
df['all_review_dates'] = df['URL_TA'].apply(from_website)

KeyboardInterrupt: 

# Разбиваем датафрейм на части, необходимые для обучения и тестирования модели

In [107]:
# y - target variable (restaurant ratings)
y = df['Rating']
# X - restaurant features: every remaining column except the target
X = df.drop(columns=['Rating'])

In [108]:
# Загружаем специальный инструмент для разбивки:
from sklearn.model_selection import train_test_split

In [109]:
# Datasets labelled "train" are used to fit the model, "test" to evaluate it.
# 25% of the original dataset is held out for testing.
# A fixed random_state makes the split (and therefore the reported metric)
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Создаём, обучаем и тестируем модель

In [85]:
# Импортируем необходимые библиотеки:
from sklearn.ensemble import RandomForestRegressor # инструмент для создания и обучения модели
from sklearn import metrics # инструменты для оценки точности модели

In [110]:
# Create the model; a fixed random_state makes the fitted forest
# (and its predictions) reproducible between runs
regr = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model on the training set
regr.fit(X_train, y_train)

# Use the fitted model to predict restaurant ratings for the test set.
# The predicted values are stored in y_pred
y_pred = regr.predict(X_test)

In [111]:
# Compare the predictions (y_pred) with the actual ratings (y_test).
# Mean Absolute Error (MAE) is the average absolute deviation of the
# predicted values from the true ones.
print('MAE:', metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred))

MAE: 0.42394733542568536


In [103]:
# Mean, median, min and max of 'Number of Reviews'.
# These are assigned before the fillna(0) call in the cell that defines them,
# so they describe only the non-missing values and differ from describe().
avg_num_rev, med_num_rev, min_num_rev, max_num_rev

(124.82547988359985, 33.0, 2.0, 9660.0)