# Загрузка Pandas и очистка данных

In [6]:
import pandas as pd
import ast
from bs4 import BeautifulSoup
import requests
import numpy as np
from multiprocessing import  Pool
from datetime import datetime
from collections import namedtuple
import worker

In [7]:
# Load the world-cities reference table and the restaurant data.
# NOTE(review): 'main_task.xls' is parsed with read_csv, so it is presumably
# a CSV file despite its extension - confirm.
df_cities = pd.read_csv('worldcities.csv')
df = pd.read_csv('main_task.xls')
# Independent copy of the first 20 rows, used for quick experiments.
df_small = df.head(20).copy()
df_small

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA
0,id_5569,Paris,"['European', 'French', 'International']",5570.0,3.5,$$ - $$$,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643
1,id_1535,Stockholm,,1537.0,4.0,,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032
2,id_352,London,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,$$$$,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781
3,id_3456,Berlin,,3458.0,5.0,,3.0,"[[], []]",/Restaurant_Review-g187323-d1358776-Reviews-Es...,d1358776
4,id_615,Munich,"['German', 'Central European', 'Vegetarian Fri...",621.0,4.0,$$ - $$$,84.0,"[['Best place to try a Bavarian food', 'Nice b...",/Restaurant_Review-g187309-d6864963-Reviews-Au...,d6864963
5,id_1418,Oporto,,1419.0,3.0,,2.0,"[['There are better 3 star hotel bars', 'Amazi...",/Restaurant_Review-g189180-d12503536-Reviews-D...,d12503536
6,id_1720,Milan,"['Italian', 'Pizza']",1722.0,4.0,$,50.0,"[['Excellent simple local eatery.', 'Excellent...",/Restaurant_Review-g187849-d5808504-Reviews-Pi...,d5808504
7,id_825,Bratislava,['Italian'],826.0,3.0,,9.0,"[['Wasting of money', 'excellent cuisine'], ['...",/Restaurant_Review-g274924-d3199765-Reviews-Ri...,d3199765
8,id_2690,Vienna,,2692.0,4.0,,,"[[], []]",/Restaurant_Review-g190454-d12845029-Reviews-G...,d12845029
9,id_4209,Rome,"['Italian', 'Pizza', 'Fast Food']",4210.0,4.0,$,55.0,"[['Clean efficient staff', 'Nice little pizza ...",/Restaurant_Review-g187791-d8020681-Reviews-Qu...,d8020681


In [8]:
# PRE-PROCESSING worldcities dataset from https://www.kaggle.com/viswanathanc/world-cities-datasets
#
# The whole clean-up is one method chain that builds a new frame. The previous
# version called fillna(..., inplace=True) on columns of a sliced frame, which
# triggers SettingWithCopyWarning and has no effect under pandas copy-on-write.
#
# Steps:
#   1. keep only the required columns;
#   2. missing 'capital' -> 'other': the assumption is that most capitals are
#      listed as primary, and the other classes do not matter because the
#      variable will be binary in the end (is_capital = 0 or 1);
#   3. missing 'population' -> 0: the assumption is that the most important
#      cities have their population mentioned in the data set;
#   4. sort by name and descending population, then keep the first row per
#      name - this leaves only the most populous city when several cities
#      share the same name.
# TODO: map 'capital' to the binary is_capital flag.
df_cities = (
    df_cities[['city_ascii', 'capital', 'population']]
    .fillna({'capital': 'other', 'population': 0})
    .sort_values(by=['city_ascii', 'population'], ascending=[True, False])
    .drop_duplicates(subset='city_ascii', keep='first')
)
# Sanity check: every city name should now occur exactly once.
df_cities['city_ascii'].value_counts()

Redding        1
Destin         1
Campos         1
Abakaliki      1
Callao         1
              ..
Cresson        1
Pemba          1
Westford       1
Arkadelphia    1
Holman         1
Name: city_ascii, Length: 13482, dtype: int64

In [21]:
# Helper functions are defined in this cell

def drop_by_type(d, t):
    '''
    Returns a copy of the dataframe without the columns of a given dtype.
    The input dataframe is not modified; the names of the removed
    columns are printed.
    Params:
        d - target dataframe
        t - dtype name as text, compared against str(column.dtype)
            (for example, 'float64')
    '''
    matching = []
    for name in d.columns:
        if str(d[name].dtype) == t:
            matching.append(name)
    trimmed = d.copy().drop(columns=matching)
    print(f"List of dropped columns (by type {t}): {matching}")
    return trimmed


def from_website(url, timeout=2):
    '''
    Fetches the review dates shown on a restaurant's tripadvisor page.

    Params:
        url - site-relative path of the page; it is appended to
              "https://www.tripadvisor.com"
        timeout - seconds passed to requests.get as the timeout
    Returns:
        list of datetime objects, one per <span class="ratingDate">
        element whose title attribute parses as '%B %d, %Y'.
        An empty list is returned (and a message printed) when the page
        cannot be downloaded, so that one unreachable URL does not abort
        a whole Series.apply() run.
    '''
    try:
        r = requests.get(f"https://www.tripadvisor.com{url}", timeout=timeout)
        r.raise_for_status()
    except requests.RequestException as err:
        print(f"Could not fetch {url}: {err}")
        return []

    # Name the parser explicitly: without it BeautifulSoup guesses one from
    # whatever is installed, so results can differ between environments.
    soup = BeautifulSoup(r.content, 'html.parser')

    rev_dates_lst = []
    for tag in soup.find_all('span', {"class": "ratingDate"}):
        title = tag.get('title')
        if not title:
            # span without a title attribute carries no date to parse
            continue
        try:
            rev_dates_lst.append(datetime.strptime(title, '%B %d, %Y'))
        except ValueError:
            # title is not in the expected 'Month day, year' format
            continue

    return rev_dates_lst

def apply_to_dataframe(df):
    '''
    Returns a copy of df with an extra 'all_review_dates' column that
    holds the result of from_website() for every value of 'URL_TA'.
    The input dataframe is left unchanged.
    '''
    review_dates = df['URL_TA'].apply(from_website)
    return df.assign(all_review_dates=review_dates)

def parallelize_dataframe(df, func, n_cores=12):
    '''
    Applies func to row-wise chunks of df in worker processes and
    concatenates the results in the original row order.

    Params:
        df - dataframe to process
        func - callable that takes a dataframe chunk and returns a
               dataframe; it must be picklable whenever more than one
               chunk is produced (it is sent to worker processes)
        n_cores - maximum number of chunks / worker processes
    Returns:
        pd.concat of the per-chunk results.
    '''
    # Never create more chunks than rows: empty chunks would only spawn
    # idle workers. At least one chunk is kept so an empty frame still
    # goes through func.
    n_chunks = max(1, min(int(n_cores), len(df)))

    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame relies on DataFrame.swapaxes, which recent pandas deprecates.
    positions = np.array_split(np.arange(len(df)), n_chunks)
    df_split = [df.iloc[idx] for idx in positions]

    if n_chunks == 1:
        # A single chunk gains nothing from a process pool.
        return pd.concat([func(chunk) for chunk in df_split])

    # The context manager shuts the pool down even when func raises.
    with Pool(n_chunks) as pool:
        results = pool.map(func, df_split)
    return pd.concat(results)

In [5]:
# Frequency of each distinct value in the 'Price Range' column
df['Price Range'].value_counts()

$$ - $$$    18412
$            6279
$$$$         1423
Name: Price Range, dtype: int64

In [6]:
# Number of distinct cities present in the data set
df['City'].nunique()

31

In [7]:
# Unique cuisine styles and how often each one occurs.
# The lists of styles are stored as strings in the dataframe, so every
# cell is parsed back into a real list with ast.literal_eval.
# IMPORTANT: nan values are first replaced by the fake list "['dummy_style']"
cuisine_styles = (
    df['Cuisine Style']
    .fillna("['dummy_style']")
    .map(ast.literal_eval)
)
cuisine_styles.explode().value_counts()

Vegetarian Friendly    11189
European               10060
dummy_style             9283
Mediterranean           6277
Italian                 5964
                       ...  
Yunnan                     1
Burmese                    1
Latvian                    1
Xinjiang                   1
Salvadoran                 1
Name: Cuisine Style, Length: 126, dtype: int64

In [8]:
# mean length of the per-restaurant style lists
# (rows filled with the dummy style count as one style)
cuisine_styles.map(len).mean()

2.6224

In [105]:
# Keep only the non-object columns for the baseline model.
df = drop_by_type(df, 'object')
# Summary statistics of the review counts, taken while missing values are
# still NaN so they are not skewed by the zero-filling below.
avg_num_rev = df['Number of Reviews'].mean()
med_num_rev = df['Number of Reviews'].median()
min_num_rev = df['Number of Reviews'].min()
max_num_rev = df['Number of Reviews'].max()
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form does not update df under
# pandas copy-on-write.
df['Number of Reviews'] = df['Number of Reviews'].fillna(0)
df.info()

List of dropped columns (by type object): ['Restaurant_id', 'City', 'Cuisine Style', 'Price Range', 'Reviews', 'URL_TA', 'ID_TA']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Ranking            40000 non-null  float64
 1   Rating             40000 non-null  float64
 2   Number of Reviews  40000 non-null  float64
dtypes: float64(3)
memory usage: 937.6 KB


In [106]:
# Distribution of the review counts as currently stored in df
df['Number of Reviews'].describe()

count    40000.000000
mean       116.889700
std        287.729821
min          0.000000
25%          7.000000
50%         28.000000
75%        105.000000
max       9660.000000
Name: Number of Reviews, dtype: float64

In [5]:
# Time one sequential crawl (1 loop, 1 repeat) of the df_small sample;
# the statement stores the scraped dates in df_small['all_review_dates'].
%timeit -n1 -r1 df_small['all_review_dates'] = df_small['URL_TA'].apply(from_website)

https://www.tripadvisor.com/Restaurant_Review-g187147-d1912643-Reviews-R_Yves-Paris_Ile_de_France.html
https://www.tripadvisor.com/Restaurant_Review-g189852-d7992032-Reviews-Buddha_Nepal-Stockholm.html
https://www.tripadvisor.com/Restaurant_Review-g186338-d8632781-Reviews-ROKA_Mayfair-London_England.html
https://www.tripadvisor.com/Restaurant_Review-g187323-d1358776-Reviews-Esplanade-Berlin.html
https://www.tripadvisor.com/Restaurant_Review-g187309-d6864963-Reviews-Augustiner_Schutzen_Garten-Munich_Upper_Bavaria_Bavaria.html
https://www.tripadvisor.com/Restaurant_Review-g189180-d12503536-Reviews-Dick_s_Bar-Porto_Porto_District_Northern_Portugal.html
https://www.tripadvisor.com/Restaurant_Review-g187849-d5808504-Reviews-Pizzeria_La_Costiera-Milan_Lombardy.html
https://www.tripadvisor.com/Restaurant_Review-g274924-d3199765-Reviews-Ristorante_Italiano_San_Cono-Bratislava_Bratislava_Region.html
https://www.tripadvisor.com/Restaurant_Review-g190454-d12845029-Reviews-Grunstern-Vienna.html
ht

In [22]:
if __name__ ==  '__main__': 
    # In notebook order `df` has already been reduced to its non-object
    # columns, so it no longer contains URL_TA. Reload the raw data for the
    # crawl so this cell also works after Restart & Run All.
    # NOTE: this crawls one page per row and is therefore slow.
    df_crawled = parallelize_dataframe(pd.read_csv('main_task.xls'), worker.apply_to_dataframe)

In [25]:
# Persist the crawled frame (including its index) as CSV
df_crawled.to_csv('with_additional_data_fromTA.csv')

# Разбиваем датафрейм на части, необходимые для обучения и тестирования модели

In [107]:
# y - target variable (restaurant ratings),
# X - restaurant features, i.e. every remaining column except the target.
y = df['Rating']
X = df.drop(columns=['Rating'])

In [108]:
# Загружаем специальный инструмент для разбивки:
from sklearn.model_selection import train_test_split

In [109]:
# The "train" sets are used to fit the model, the "test" sets to evaluate it.
# 25% of the original data set is held out for testing.
# A fixed random_state makes the split - and every metric computed from it -
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Создаём, обучаем и тестируем модель

In [85]:
# Импортируем необходимые библиотеки:
from sklearn.ensemble import RandomForestRegressor # инструмент для создания и обучения модели
from sklearn import metrics # инструменты для оценки точности модели

In [110]:
# Create the model; a fixed random_state makes the forest reproducible
regr = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model on the training split
regr.fit(X_train, y_train)

# Use the fitted model to predict restaurant ratings for the test split.
# The predicted values are stored in y_pred
y_pred = regr.predict(X_test)

In [111]:
# Compare the predicted values (y_pred) with the actual ones (y_test) and see how far apart they are on average.
# The metric is the Mean Absolute Error (MAE): the mean absolute deviation of the predictions from the actual values.
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))

MAE: 0.42394733542568536


In [103]:
# Mean, median, min and max of 'Number of Reviews', as captured before the
# missing values were replaced with 0
avg_num_rev, med_num_rev, min_num_rev, max_num_rev

(124.82547988359985, 33.0, 2.0, 9660.0)

In [23]:
# Show the sample frame, which now carries the scraped 'all_review_dates' column
df_small

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,all_review_dates
0,id_5569,Paris,"['European', 'French', 'International']",5570.0,3.5,$$ - $$$,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643,"[2020-02-14 00:00:00, 2019-12-20 00:00:00, 201..."
1,id_1535,Stockholm,,1537.0,4.0,,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032,"[2020-02-27 00:00:00, 2017-07-06 00:00:00, 201..."
2,id_352,London,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,$$$$,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781,"[2020-09-25 00:00:00, 2020-09-05 00:00:00, 202..."
3,id_3456,Berlin,,3458.0,5.0,,3.0,"[[], []]",/Restaurant_Review-g187323-d1358776-Reviews-Es...,d1358776,"[2016-02-19 00:00:00, 2012-05-04 00:00:00, 201..."
4,id_615,Munich,"['German', 'Central European', 'Vegetarian Fri...",621.0,4.0,$$ - $$$,84.0,"[['Best place to try a Bavarian food', 'Nice b...",/Restaurant_Review-g187309-d6864963-Reviews-Au...,d6864963,"[2020-09-10 00:00:00, 2019-09-28 00:00:00, 201..."
5,id_1418,Oporto,,1419.0,3.0,,2.0,"[['There are better 3 star hotel bars', 'Amazi...",/Restaurant_Review-g189180-d12503536-Reviews-D...,d12503536,"[2020-09-21 00:00:00, 2020-03-03 00:00:00, 201..."
6,id_1720,Milan,"['Italian', 'Pizza']",1722.0,4.0,$,50.0,"[['Excellent simple local eatery.', 'Excellent...",/Restaurant_Review-g187849-d5808504-Reviews-Pi...,d5808504,"[2020-02-06 00:00:00, 2019-11-09 00:00:00, 201..."
7,id_825,Bratislava,['Italian'],826.0,3.0,,9.0,"[['Wasting of money', 'excellent cuisine'], ['...",/Restaurant_Review-g274924-d3199765-Reviews-Ri...,d3199765,"[2014-02-16 00:00:00, 2012-06-13 00:00:00]"
8,id_2690,Vienna,,2692.0,4.0,,,"[[], []]",/Restaurant_Review-g190454-d12845029-Reviews-G...,d12845029,[2017-08-25 00:00:00]
9,id_4209,Rome,"['Italian', 'Pizza', 'Fast Food']",4210.0,4.0,$,55.0,"[['Clean efficient staff', 'Nice little pizza ...",/Restaurant_Review-g187791-d8020681-Reviews-Qu...,d8020681,"[2020-02-24 00:00:00, 2019-05-27 00:00:00, 201..."
