# Исследовательский анализ данных

In [1]:
%%capture no-display
!pip install h3 geohash2 folium h3-py

In [2]:
import pandas as pd
import numpy as np

import os

import geohash2
import h3
import folium

In [46]:
# Decode a sample geohash into its latitude/longitude pair and show it.
geohash = "u6sc4"
lat, lon = geohash2.decode(geohash)
print(lat, lon)

# Index the same point in H3 at the chosen resolution; the decoded values
# are passed through float() before being handed to h3.
resolution = 8
h3_hash = h3.geo_to_h3(*map(float, (lat, lon)), resolution)

59.3 18.


In [47]:
def generate_h3_hexagons(lat, lon, resolution, k=1):
    """Return the H3 cells within ``k`` rings of the cell containing a point.

    Args:
        lat: Latitude of the point, passed to ``h3.geo_to_h3``.
        lon: Longitude of the point, passed to ``h3.geo_to_h3``.
        resolution: H3 resolution used to index the point.
        k: Ring distance passed to ``h3.k_ring``. Defaults to 1, the value
            that used to be hard-coded, so existing calls are unaffected.

    Returns:
        The result of ``h3.k_ring`` for the centre cell and ``k``.
    """
    center_cell = h3.geo_to_h3(lat, lon, resolution)
    return h3.k_ring(center_cell, k)

# Centre the map on the decoded geohash coordinates (converted to floats).
center = [float(lat), float(lon)]
m = folium.Map(location=center, zoom_start=10)

resolution = 5  # Adjust this to your desired resolution
hexagons = generate_h3_hexagons(center[0], center[1], resolution)

for hexagon in hexagons:
    # Use the default (lat, lng) vertex order. geo_json=True flips every
    # vertex to (lng, lat), while folium.Polygon expects (lat, lng) pairs,
    # so the hexagons would be drawn with swapped coordinates.
    vertices = h3.h3_to_geo_boundary(hexagon)
    folium.Polygon(locations=vertices, color='blue', fill=True, fill_color='blue').add_to(m)

# Final expression of the cell: renders the map.
m

In [34]:
# Centre point and H3 resolution for a single-hexagon demo
# (a higher resolution gives smaller hexagons).
center_lat, center_lon = 1.5, 103.8
resolution = 7

# Start with the base map, centred on the chosen point.
m = folium.Map(location=[center_lat, center_lon], zoom_start=12)

# Find the H3 cell containing the point and fetch its boundary vertices.
hexagon = h3.geo_to_h3(center_lat, center_lon, resolution)
vertices = h3.h3_to_geo_boundary(hexagon)

# Overlay the cell on the map as a filled blue polygon.
folium.Polygon(
    locations=vertices,
    color="blue",
    fill=True,
    fill_color="blue",
).add_to(m)

# Leaving the map as the cell's final expression renders it.
m

## 1 Подготовка данных

### 1.1 Загрузка данных

In [6]:
# Directory holding the raw data files that the cells below read.
PATH = 'data/'

In [7]:
# Show which files are present in the data directory.
os.listdir(PATH)

['orders_se.txt',
 'orders_se_test.txt',
 'orders_se_train.txt',
 'products_se.txt',
 'vendors_se.txt']

In [8]:
def get_data(path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame and show a short preview.

    The first column of the file becomes the index. The first three rows
    are displayed and the frame's shape is printed before it is returned.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The loaded DataFrame.
    """
    frame = pd.read_csv(path, index_col=0)

    preview = frame.head(3)
    display(preview)
    print('Data shape:', frame.shape)

    return frame

In [9]:
# Load the products table (products_se.txt).
data_products = get_data(os.path.join(PATH, 'products_se.txt'))

Unnamed: 0,vendor_id,product_id,name,unit_price
0,e21306e2,055e72373069,Small,0.496
1,7b111c3f,ec33db14a2c5,Alcachofas Empinzadas,0.304
2,4dc46e0f,fa04d9e5b964,Coca-Cola 2L,0.24


Data shape: (111046, 4)


In [10]:
# Load the vendors table (vendors_se.txt).
data_vendors = get_data(os.path.join(PATH, 'vendors_se.txt'))

Unnamed: 0,vendor_id,chain_id,geohash,primary_cuisine
0,a5028ec5,,u6sc9,thai
1,fed855cd,,u6sc7,thai
2,5f4ad3ab,,u6sc8,sushi


Data shape: (1148, 4)


In [11]:
# Load the complete orders table (orders_se.txt).
data_orders = get_data(os.path.join(PATH, 'orders_se.txt'))

Unnamed: 0,customer_id,geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day
0,90a4e98622,u6sc4,0,e1f3e4a4,9971ae2cd1ba,3,16:00:16,11 days
1,90a4e98622,u6sc4,0,e1f3e4a4,00734c4b351f,3,16:00:16,11 days
2,90a4e98622,u6sc4,1,5d1b1300,9a2b00f39640,1,16:34:04,51 days


Data shape: (756738, 8)


In [12]:
# Load the training part of the orders (orders_se_train.txt).
data_orders_train = get_data(os.path.join(PATH, 'orders_se_train.txt'))

Unnamed: 0,customer_id,geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day
0,90a4e98622,u6sc4,0,e1f3e4a4,9971ae2cd1ba,3,16:00:16,11 days
1,90a4e98622,u6sc4,0,e1f3e4a4,00734c4b351f,3,16:00:16,11 days
2,90a4e98622,u6sc4,1,5d1b1300,9a2b00f39640,1,16:34:04,51 days


Data shape: (617196, 8)


In [13]:
# Load the test part of the orders (orders_se_test.txt).
data_orders_test = get_data(os.path.join(PATH, 'orders_se_test.txt'))

Unnamed: 0,customer_id,geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day
3,90a4e98622,u6sc4,2,5d1b1300,9a2b00f39640,6,14:39:12,84 days
17,1c2b4598db,u6sc9,11,262f6435,ae0b5cbf8dd1,6,17:10:04,84 days
18,1c2b4598db,u6sc9,11,262f6435,674d25744130,6,17:10:04,84 days


Data shape: (139542, 8)


In [14]:
# Sanity check: train and test rows together add up to the full orders table.
len(data_orders_train) + len(data_orders_test) == len(data_orders)

True

In [15]:
# Sanity check: the distinct row labels of train and test together number
# exactly as many as the rows of the full orders table.
len(set(data_orders_train.index).union(data_orders_test.index)) == len(data_orders)

True

### 1.2 Объединение данных

In [16]:
# The orders tables also carry a `geohash` column, so give the vendor one a
# distinct name before the tables are merged.
data_vendors.rename(columns={'geohash': 'geohash_vendor'}, inplace=True)
data_vendors.columns

Index(['vendor_id', 'chain_id', 'geohash_vendor', 'primary_cuisine'], dtype='object')

In [17]:
# Attach the vendor attributes to every product, then discard the join key.
data_products_vendors = pd.merge(data_products, data_vendors, on='vendor_id')
data_products_vendors = data_products_vendors.drop(columns='vendor_id')
data_products_vendors.head(3)

Unnamed: 0,product_id,name,unit_price,chain_id,geohash_vendor,primary_cuisine
0,055e72373069,Small,0.496,,u6scd,pizza
1,08606ad2b6e0,Coca-Cola Zero 33 cl,0.076,,u6scd,pizza
2,bcd1a2e2c499,Kvällsmys 2,1.0,,u6scd,pizza


In [18]:
def merge_data_with_products_and_vendors(df: pd.DataFrame,
                                         df_to_merge: pd.DataFrame,
                                         on_col: str) -> pd.DataFrame:
    """Merge two frames on a shared column and show a short preview.

    Uses the default merge behaviour of pandas. The first three rows of the
    merged frame are displayed and its shape is printed before returning.

    Args:
        df: Left-hand frame of the merge.
        df_to_merge: Right-hand frame of the merge.
        on_col: Name of the column to join on.

    Returns:
        The merged DataFrame.
    """
    merged = pd.merge(df, df_to_merge, on=on_col)

    preview = merged.head(3)
    display(preview)
    print('Data shape:', merged.shape)

    return merged

In [19]:
# Enrich the training orders with product and vendor attributes.
data_train = merge_data_with_products_and_vendors(
    df=data_orders_train,
    df_to_merge=data_products_vendors,
    on_col='product_id',
)

Unnamed: 0,customer_id,geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day,name,unit_price,chain_id,geohash_vendor,primary_cuisine
0,90a4e98622,u6sc4,0,e1f3e4a4,9971ae2cd1ba,3,16:00:16,11 days,Coca-Cola original taste 33 cl,0.152,3ed908e5,u6sc5,italienskt
1,3f76a09dc3,u6scj,3648,e1f3e4a4,9971ae2cd1ba,0,15:21:39,57 days,Coca-Cola original taste 33 cl,0.152,3ed908e5,u6sc5,italienskt
2,0d6785f3d2,u6sbv,6300,e1f3e4a4,9971ae2cd1ba,2,16:21:32,52 days,Coca-Cola original taste 33 cl,0.152,3ed908e5,u6sc5,italienskt


Data shape: (617196, 13)


In [20]:
# Enrich the test orders with product and vendor attributes.
data_test = merge_data_with_products_and_vendors(
    df=data_orders_test,
    df_to_merge=data_products_vendors,
    on_col='product_id',
)

Unnamed: 0,customer_id,geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day,name,unit_price,chain_id,geohash_vendor,primary_cuisine
0,90a4e98622,u6sc4,2,5d1b1300,9a2b00f39640,6,14:39:12,84 days,Pad Thai,0.64,f782a3fc,u6sc6,asiatiskt
1,ee38cbae98,u6sc2,4060,5d1b1300,9a2b00f39640,5,17:54:39,83 days,Pad Thai,0.64,f782a3fc,u6sc6,asiatiskt
2,2ece94e7e9,u6sc3,4157,5d1b1300,9a2b00f39640,2,17:20:34,80 days,Pad Thai,0.64,f782a3fc,u6sc6,asiatiskt


Data shape: (139542, 13)


## 2 Общий анализ данных

In [21]:
# Column dtypes, non-null counts and memory usage of the merged training data.
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617196 entries, 0 to 617195
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   customer_id      617196 non-null  object 
 1   geohash          617196 non-null  object 
 2   order_id         617196 non-null  int64  
 3   vendor_id        617196 non-null  object 
 4   product_id       617196 non-null  object 
 5   day_of_week      617196 non-null  int64  
 6   order_time       617196 non-null  object 
 7   order_day        617196 non-null  object 
 8   name             617196 non-null  object 
 9   unit_price       617196 non-null  float64
 10  chain_id         295833 non-null  object 
 11  geohash_vendor   617196 non-null  object 
 12  primary_cuisine  617196 non-null  object 
dtypes: float64(1), int64(2), object(10)
memory usage: 61.2+ MB


In [28]:
# Summary statistics, transposed so that each column becomes one row.
data_train.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
order_id,617196.0,199998.939128,115466.216446,0.0,99876.0,199947.5,299943.75,399999.0
day_of_week,617196.0,3.318369,1.954286,0.0,2.0,4.0,5.0,6.0
unit_price,617196.0,0.42204,0.252432,0.0,0.16,0.44,0.596,1.0


In [22]:
# Number of missing values in each column.
data_train.isna().sum()

customer_id             0
geohash                 0
order_id                0
vendor_id               0
product_id              0
day_of_week             0
order_time              0
order_day               0
name                    0
unit_price              0
chain_id           321363
geohash_vendor          0
primary_cuisine         0
dtype: int64

In [23]:
# Count rows that exactly repeat an earlier row across all columns.
# NOTE(review): repeated rows might be several units of one product within
# the same order rather than data errors - confirm before dropping them.
print('Total duplicates:', data_train.duplicated().sum())

Total duplicates: 12610


In [29]:
# Count fully duplicated rows per customer. The boolean mask from
# duplicated() is computed once and applied through .loc, instead of routing
# index labels through positional .iloc (which only lines up while the frame
# has a default RangeIndex).
data_train.loc[data_train.duplicated(), 'customer_id'].value_counts()

customer_id
968dea7d53    19
8e11194409    15
bd0fbe1e28    15
a5774f4a01    14
8e8e6613d9    13
              ..
5e2b3a839c     1
af2b216ff8     1
f219dd8ed7     1
0ae188c34c     1
2b9b43ad47     1
Name: count, Length: 9116, dtype: int64

In [25]:
# Print the distinct product names found in the products table.
print(data_products['name'].unique())

['Small' 'Alcachofas Empinzadas' 'Coca-Cola 2L' ...
 'Jätterulle med tjockkorv' 'Bruschetta Gratinata' 'Simpsons Pizza']


## 3 Предобработка данных

## 4 Исследовательский анализ данных