In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import math

# Отборочный этап

In [2]:
df = pd.read_csv('startup_users_visits.csv')

In [3]:
df.sample(5, random_state=0)

Unnamed: 0,user_id,date,pay
4775,01067493d81f2892,2023-04-26,False
292097,640609eaa716c09f,2023-07-07,False
370529,7ee5a7e3030e20e6,2023-09-03,False
370032,7ec43cec81bde6f9,2023-06-19,False
205160,45ba1e23a5b01dda,2023-08-05,False


In [4]:
df.shape

(373366, 3)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373366 entries, 0 to 373365
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   user_id  373366 non-null  object
 1   date     373366 non-null  object
 2   pay      373366 non-null  bool  
dtypes: bool(1), object(2)
memory usage: 6.1+ MB


In [6]:
df['date'] = pd.to_datetime(df['date'])

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373366 entries, 0 to 373365
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   user_id  373366 non-null  object        
 1   date     373366 non-null  datetime64[ns]
 2   pay      373366 non-null  bool          
dtypes: bool(1), datetime64[ns](1), object(1)
memory usage: 6.1+ MB


In [8]:
border_date = df.date.max() - pd.Timedelta(90, unit='d')

In [9]:
border_date

Timestamp('2023-07-02 00:00:00')

In [10]:
df.sample(5, random_state=0)

Unnamed: 0,user_id,date,pay
4775,01067493d81f2892,2023-04-26,False
292097,640609eaa716c09f,2023-07-07,False
370529,7ee5a7e3030e20e6,2023-09-03,False
370032,7ec43cec81bde6f9,2023-06-19,False
205160,45ba1e23a5b01dda,2023-08-05,False


In [11]:
# Earliest visit date per user: one row per user_id, stored in `min_date`.
start_using = (
    df
    .groupby("user_id", as_index=False)
    .agg(min_date=('date', 'min'))
)

In [12]:
# The 90-day cutoff is derived from the exact first-visit date, so it has to be
# computed before `min_date` is collapsed to the first day of its month.
# Note: this cell overwrites `min_date`, so re-running it without re-running
# the previous cell would shift `date_after_90`.
start_using['date_after_90'] = start_using['min_date'] + pd.Timedelta(90, unit='d')
start_using['min_date'] = start_using['min_date'].dt.to_period('M').dt.to_timestamp()

In [13]:
start_using.head()

Unnamed: 0,user_id,min_date,date_after_90
0,0001599900972add,2023-06-01,2023-09-13
1,00019b04f17ade68,2022-12-01,2023-03-18
2,0001c405d5a77893,2023-07-01,2023-10-07
3,000314ab1fe38c66,2023-04-01,2023-07-21
4,0003f0dbb8dc8b65,2023-02-01,2023-05-18


In [14]:
start_using.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25648 entries, 0 to 25647
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   user_id        25648 non-null  object        
 1   min_date       25648 non-null  datetime64[ns]
 2   date_after_90  25648 non-null  datetime64[ns]
dtypes: datetime64[ns](2), object(1)
memory usage: 801.5+ KB


In [15]:
# Attach each user's cohort month and 90-day cutoff to every visit row.
full_df = pd.merge(df, start_using, how='left', on='user_id')

In [16]:
full_df.sample(5, random_state=0)

Unnamed: 0,user_id,date,pay,min_date,date_after_90
4775,01067493d81f2892,2023-04-26,False,2023-02-01,2023-05-11
292097,640609eaa716c09f,2023-07-07,False,2023-05-01,2023-08-27
370529,7ee5a7e3030e20e6,2023-09-03,False,2023-06-01,2023-09-08
370032,7ec43cec81bde6f9,2023-06-19,False,2023-04-01,2023-07-29
205160,45ba1e23a5b01dda,2023-08-05,False,2023-06-01,2023-09-17


In [17]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 373366 entries, 0 to 373365
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   user_id        373366 non-null  object        
 1   date           373366 non-null  datetime64[ns]
 2   pay            373366 non-null  bool          
 3   min_date       373366 non-null  datetime64[ns]
 4   date_after_90  373366 non-null  datetime64[ns]
dtypes: bool(1), datetime64[ns](3), object(1)
memory usage: 14.6+ MB


In [18]:
full_df.shape

(373366, 5)

In [19]:
# Per cohort month, count the paying visit rows that fall on or before the
# user's 90-day cutoff. This counts rows, not distinct users.
paid_in_window = full_df[full_df['pay'] & (full_df['date'] <= full_df['date_after_90'])]
ans = paid_in_window.groupby('min_date', as_index=False)['user_id'].count()

In [20]:
ans

Unnamed: 0,min_date,user_id
0,2022-11-01,133
1,2022-12-01,552
2,2023-01-01,437
3,2023-02-01,949
4,2023-03-01,950
5,2023-04-01,678
6,2023-05-01,954
7,2023-06-01,848
8,2023-07-01,698
9,2023-08-01,603


In [21]:
ans.to_csv('ans_cup_2')

In [22]:
df = pd.read_csv('path.csv')

In [23]:
df

Unnamed: 0,x,y
0,487.5,802.5
1,487.5,750.0
2,484.5,682.5
3,435.0,633.0
4,402.0,592.5
...,...,...
86,1174.5,849.0
87,1167.0,783.0
88,1164.0,757.5
89,1135.5,726.0


In [24]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=df.x, y=df.y))

In [25]:
# Pair every point with its successor and measure the straight-line step
# between them; the last row has no successor, so its values are NaN.
successor = df[['x', 'y']].shift(-1)
df['x_next'] = successor['x']
df['y_next'] = successor['y']
step_x = df['x'] - df['x_next']
step_y = df['y'] - df['y_next']
df['dist'] = np.sqrt(step_x**2 + step_y**2)

In [26]:
df

Unnamed: 0,x,y,x_next,y_next,dist
0,487.5,802.5,487.5,750.0,52.500000
1,487.5,750.0,484.5,682.5,67.566634
2,484.5,682.5,435.0,633.0,70.003571
3,435.0,633.0,402.0,592.5,52.242224
4,402.0,592.5,387.0,541.5,53.160135
...,...,...,...,...,...
86,1174.5,849.0,1167.0,783.0,66.424769
87,1167.0,783.0,1164.0,757.5,25.675864
88,1164.0,757.5,1135.5,726.0,42.479407
89,1135.5,726.0,1126.5,703.5,24.233242


In [27]:
df.query('(dist > 80)')

Unnamed: 0,x,y,x_next,y_next,dist
6,423.0,543.0,391.5,735.0,194.566827
7,391.5,735.0,468.0,580.5,172.402146
19,547.5,816.0,435.0,735.0,138.626296
22,465.0,732.0,610.5,816.0,168.006696
26,738.0,810.0,766.5,891.0,85.867631
27,766.5,891.0,774.0,810.0,81.346481
44,856.5,789.0,1002.0,783.0,145.623659
45,1002.0,783.0,900.0,789.0,102.176318
72,1222.5,760.5,1136.5,738.0,88.894601
73,1136.5,738.0,1228.5,714.0,95.078915


In [28]:
df.dist.quantile(.8)

52.58564442887431

In [29]:
# Hard-coded points to remove from the path, stored as {x: y}.
# NOTE(review): a dict keyed by x cannot hold two points that share the same
# x coordinate — confirm that never happens in this data.
fucked = {391.5: 735, 435: 735, 448.5: 724.5, 465: 732, 766.5: 891, 1002: 783, 1136.5: 738.0}
# Views over the x coordinates (keys) and y coordinates (values) listed above.
fuckx = fucked.keys()
fucky = fucked.values()

In [30]:
# Drop only the rows whose (x, y) pair is one of the hard-coded points.
# Testing x and y membership independently (x in keys AND y in values) would
# also drop any row that combines the x of one listed point with the y of
# another, so the match is done on whole coordinate pairs instead.
outlier_points = {(float(px), float(py)) for px, py in fucked.items()}
is_outlier = np.array(
    [point in outlier_points for point in zip(df['x'], df['y'])],
    dtype=bool,
)
# df.drop keeps the original index labels of the surviving rows.
cor_df = df.drop(df.index[is_outlier])

In [31]:
# Recompute successor coordinates and step lengths on the cleaned path.
# shift(-1) is positional, so gaps left in the index by the dropped rows do
# not matter; the last row gets NaN.
following = cor_df[['x', 'y']].shift(-1)
cor_df['x_next'] = following['x']
cor_df['y_next'] = following['y']
delta_x = cor_df['x'] - cor_df['x_next']
delta_y = cor_df['y'] - cor_df['y_next']
cor_df['dist'] = np.sqrt(delta_x**2 + delta_y**2)

In [32]:
cor_df.head(10)

Unnamed: 0,x,y,x_next,y_next,dist
0,487.5,802.5,487.5,750.0,52.5
1,487.5,750.0,484.5,682.5,67.566634
2,484.5,682.5,435.0,633.0,70.003571
3,435.0,633.0,402.0,592.5,52.242224
4,402.0,592.5,387.0,541.5,53.160135
5,387.0,541.5,423.0,543.0,36.031236
6,423.0,543.0,468.0,580.5,58.576873
8,468.0,580.5,502.5,604.5,42.026777
9,502.5,604.5,543.0,576.0,49.522722
10,543.0,576.0,570.0,529.5,53.770345


In [33]:
cor_df.query('(dist > 80)')

Unnamed: 0,x,y,x_next,y_next,dist


In [34]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=cor_df.x, y=cor_df.y))

In [35]:
round(cor_df.dist.sum() / 1000, 5)

3.18015

In [36]:
# Read operation_id (column 3) explicitly as text. Letting pandas infer its
# type chunk by chunk triggered the "mixed types" DtypeWarning and can leave
# the same identifier stored under different Python types.
# Missing values are still read as NaN.
df = pd.read_csv('logs.tsv', sep='\t', dtype={'operation_id': str})


Columns (3) have mixed types.Specify dtype option on import or set low_memory=False.



In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1054451 entries, 0 to 1054450
Data columns (total 5 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   timestamp     1054451 non-null  object
 1   context       1054451 non-null  object
 2   event         1054451 non-null  object
 3   operation_id  954451 non-null   object
 4   user          578265 non-null   object
dtypes: object(5)
memory usage: 40.2+ MB


In [38]:
# Parse the timestamp strings into datetimes (the parsed values carry a UTC offset).
# NOTE(review): the raw values appear both with and without fractional seconds;
# newer pandas releases may need an explicit `format=` argument for such mixed
# input — confirm when upgrading.
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [39]:
df.shape

(1054451, 5)

In [40]:
df.sample(7, random_state=60)

Unnamed: 0,timestamp,context,event,operation_id,user
542352,2023-09-15 09:30:08.278693+00:00,[],view_info,1bcba707-9859a894-3fe03e8-9192a8c5,
922569,2023-09-15 18:06:56.301297+00:00,[],view_info,e4ab97e9-b8a95526-3fe03e8-20e83cf2,
756395,2023-09-15 14:21:01.304897+00:00,[],view_author,a2a43cd5-83f1a488-3fe03e8-f0b63c92,
1023709,2023-09-15 20:31:46.466846+00:00,[],view_info,ea7ad85-12f3347b-3fe03e8-e936245,
264143,2023-09-15 03:19:36.598611+00:00,[],like,52140d8f-3066f99e-3fe03e8-8cb2703b,user-dff111de5e07ea8b650400d23534eae0
928920,2023-09-15 18:15:01+00:00,"[""//188342153b6ecb4549599e569ad4b18a/374fcded2...",add_to_favorite,,user-74e8e994c19f071213bfc86e3e023d4f
663888,2023-09-15 12:16:50.696207+00:00,"[""//cbed9086dcbdef79a84b0cfbaf4a99fc/78df48153...",like,eb650aa-8ae13fe5-3fe03e8-eab363dc,user-9becb93908e2c039c2bd4cc9eec234b3


In [41]:
df.head(7)

Unnamed: 0,timestamp,context,event,operation_id,user
0,2023-09-14 21:00:00.307688+00:00,[],view_author,1210e2b6-71829001-3fe03e8-c96d0646,
1,2023-09-14 21:00:00.336550+00:00,[],view_item,8892e59a-9313f4dd-3fe03e8-5a5ac2d9,user-ecdb6142941176275ebfe04f4a4df9ff
2,2023-09-14 21:00:00.614214+00:00,[],view_info,8892e59a-9313f4dd-3fe03e8-5a5ac2d9,
3,2023-09-14 21:00:00.664451+00:00,"[""//188342153b6ecb4549599e569ad4b18a/def9e553f...",view_item,3cf901d-cecc0680-3fe03e8-a5d177ea,user-bbb7f161431770af4a7a4f4f53381abf
4,2023-09-14 21:00:00.730727+00:00,[],view_author,8892e59a-9313f4dd-3fe03e8-5a5ac2d9,
5,2023-09-14 21:00:00.810163+00:00,[],view_item,21e4d587-bd4a1c9d-3fe03e8-e37c5592,user-e7a586f029965d26074d9c4721fc8feb
6,2023-09-14 21:00:00.810163+00:00,[],view_item,21e4d587-bd4a1c9d-3fe03e8-e37c5592,user-e7a586f029965d26074d9c4721fc8feb


In [42]:
df.event.unique()

array(['view_author', 'view_item', 'view_info', 'like', 'dislike',
       'add_to_favorite'], dtype=object)

In [43]:
df.timestamp.min()

Timestamp('2023-09-14 21:00:00.307688+0000', tz='UTC')

In [44]:
df.timestamp.max()

Timestamp('2023-09-15 23:59:59+0000', tz='UTC')

In [45]:
df\
    .query('event == "like"')\
    .groupby('user', as_index=False)\
    .agg({'operation_id': 'count'})

Unnamed: 0,user,operation_id
0,user-0034fb412558c2cf52bd670079461111,5
1,user-004ac443ea809ef2d089bad3db5d8980,2
2,user-00815af9be58da67b1d1fb78e7bac672,1
3,user-00d589a031c10cd406e2d4275814a708,28
4,user-00e00b0c2a9077663fadaacdff96aaa8,6
...,...,...
2654,user-ffa67112f31b8638deb5271bf6e5e907,1
2655,user-ffae03424e4db8201deff2b847735543,14
2656,user-ffd12ed221553f811e8498c9c8a2503b,9
2657,user-fff96d811ee8bc57032536cdd7097bdd,8


In [46]:
df.nunique()

timestamp       849149
context         137674
event                6
operation_id    200000
user              3053
dtype: int64

In [47]:
df.context[3].strip('["]').split('","')

['//188342153b6ecb4549599e569ad4b18a/def9e553f2e08deb7856c6565b247c33/61b8c088fdd13c4a5ddefffd3e07b2d7/d659be8e01f8d56eeda39656dadacecb/05b91183fc809816e177ea38bec897b6/6aebc873bd5956631be6faf31088375b',
 '//188342153b6ecb4549599e569ad4b18a/def9e553f2e08deb7856c6565b247c33/61b8c088fdd13c4a5ddefffd3e07b2d7/d659be8e01f8d56eeda39656dadacecb/05b91183fc809816e177ea38bec897b6/d385fa07d216adb92cbcd6a530fc4a46',
 '//188342153b6ecb4549599e569ad4b18a/def9e553f2e08deb7856c6565b247c33/61b8c088fdd13c4a5ddefffd3e07b2d7/d659be8e01f8d56eeda39656dadacecb/05b91183fc809816e177ea38bec897b6/323d041c53bbb30119cc2933af2b5eee',
 '//188342153b6ecb4549599e569ad4b18a/def9e553f2e08deb7856c6565b247c33/61b8c088fdd13c4a5ddefffd3e07b2d7/d659be8e01f8d56eeda39656dadacecb/05b91183fc809816e177ea38bec897b6/46076e074e049055fb89286fdaa9a003',
 '//188342153b6ecb4549599e569ad4b18a/def9e553f2e08deb7856c6565b247c33/61b8c088fdd13c4a5ddefffd3e07b2d7/d659be8e01f8d56eeda39656dadacecb/05b91183fc809816e177ea38bec897b6/699f98847a1c919

In [48]:
df.operation_id

0          1210e2b6-71829001-3fe03e8-c96d0646
1          8892e59a-9313f4dd-3fe03e8-5a5ac2d9
2          8892e59a-9313f4dd-3fe03e8-5a5ac2d9
3           3cf901d-cecc0680-3fe03e8-a5d177ea
4          8892e59a-9313f4dd-3fe03e8-5a5ac2d9
                          ...                
1054446                                   NaN
1054447                                   NaN
1054448                                   NaN
1054449                                   NaN
1054450                                   NaN
Name: operation_id, Length: 1054451, dtype: object

In [49]:
df\
    .query('event == "like"')

Unnamed: 0,timestamp,context,event,operation_id,user
17,2023-09-14 21:00:02.310933+00:00,"[""//cbed9086dcbdef79a84b0cfbaf4a99fc/78df48153...",like,e3bc2d92-5338c0e3-3fe03e8-d6655699,user-7aaced7dab4dda5e7c40158460ec6ee9
18,2023-09-14 21:00:02.313745+00:00,[],like,ba88ea60-3f275092-3fe03e8-766a15c7,user-03ab3db58f15ac99a216e2bd1981b6ce
19,2023-09-14 21:00:02.335753+00:00,"[""//188342153b6ecb4549599e569ad4b18a/b63102612...",like,d437d723-855bcf00-3fe03e8-923171bd,user-fc8efb7c39914bac1be8a708e394dfb8
20,2023-09-14 21:00:02.349759+00:00,[],like,42855d18-75740dde-3fe03e8-45612f4,user-b6247671de849561d00d6574fa61f5d8
21,2023-09-14 21:00:02.370884+00:00,"[""//cbed9086dcbdef79a84b0cfbaf4a99fc/64bc3844f...",like,91f8ba0a-eaf1985-3fe03e8-cf4f1dfd,user-60ede91298d3697e57f08d736358ab4d
...,...,...,...,...,...
1042432,2023-09-15 20:59:50.802709+00:00,"[""//188342153b6ecb4549599e569ad4b18a/def9e553f...",like,677e2043-c1161a11-3fe03e8-99b0ca86,user-bbb7f161431770af4a7a4f4f53381abf
1042449,2023-09-15 20:59:52.230068+00:00,[],like,4e3c2797-168b7556-3fe03e8-7897c0a5,user-03ab3db58f15ac99a216e2bd1981b6ce
1042454,2023-09-15 20:59:52.379823+00:00,[],like,a4d22466-7a69aaf5-3fe03e8-4f9c34ce,user-e3c57d4ba5cca4d1733ddf978578405a
1042488,2023-09-15 20:59:55.222299+00:00,"[""//cbed9086dcbdef79a84b0cfbaf4a99fc/64bc3844f...",like,219dc29e-6f1ee19f-3fe03e8-fcc77190,user-4f006f46b5020b1752c45ee1bfc779a9


In [50]:
# Print 18 random rows of three integers each, largest first.
# A fixed seed makes the generated rows reproducible after a kernel restart.
rng = np.random.default_rng(0)
# Exclusive upper bound given as an integer rather than the float 235275/2.
upper_bound = 235275 // 2
for _ in range(18):
    print(*sorted(rng.integers(low=0, high=upper_bound, size=3), reverse=True), sep=',')

61516,32909,5695
93039,91301,34106
116979,92231,22725
113261,18188,16452
60463,33736,22872
71080,45389,22977
108547,87431,39932
103385,86831,73636
56210,39507,36631
115458,78524,30292
76743,57174,36172
89148,69459,59474
114524,15432,12223
94049,24775,9368
57013,31724,9656
71952,23978,23813
65774,38695,4820
108782,75377,32683


In [51]:
# n = int(input())
# mas = tuple(map(int, input().split()))

# mas = sorted(mas, reverse=True)
# f = 0
# # print(mas)
# for i in range(n):
#     if mas[i] < (i+1)**2:
#         print(i)
#         f = 1
#         break
        
# if f == 0:
#     print(n)

# Полуфинал

order - идентификатор поездки 

driver - идентификатор художника

user - идентификатор компаньона художника

zone - зона поездки

coupon - краткое название промокода поездки

currency - валюта поездки 

fact_km - фактическое расстояние поездки

plan_km - ожидаемое расстояние поездки

plan_time - ожидаемое время поездки

In [52]:
df = pd.read_csv('data_fix')

In [53]:
df.sample(5, random_state=0)

Unnamed: 0,coupon,currency,driver,fact_km,order,plan_km,plan_time,user,zone
11641,,RUB,54299147,2.362,3078564,2.425707,6.2,2810569,sergievposad
5982,,RUB,11213429,,8635223,0.118157,6.1,8531153,moscow
7115,,RUB,25217205,31.264,7311474,30.93376,43.1,71803559,moscow
1819,,RUB,96011964,,88965538,23.870702,38.15,13306124,odincovo
7288,,RUB,20999223,11.595,30873776,6.928062,11.9,19404799,moscow


In [54]:
df.shape

(17368, 9)

In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17368 entries, 0 to 17367
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   coupon     317 non-null    object 
 1   currency   17368 non-null  object 
 2   driver     17368 non-null  int64  
 3   fact_km    12705 non-null  float64
 4   order      17368 non-null  int64  
 5   plan_km    17368 non-null  float64
 6   plan_time  17368 non-null  float64
 7   user       17368 non-null  int64  
 8   zone       17368 non-null  object 
dtypes: float64(3), int64(3), object(3)
memory usage: 1.2+ MB


In [56]:
df.nunique()

coupon           2
currency         2
driver         199
fact_km       9730
order        17366
plan_km      16867
plan_time     3580
user         15719
zone            58
dtype: int64

In [57]:
df.zone.value_counts()

moscow              11559
podolsk               544
narofominsk           510
himki                 506
lyberci               464
pushkino              418
krasnogorsk           359
voskresensk           316
serpukhov             253
odincovo              251
mytishchi             176
balashiha             162
reutov                145
dolgoprudniy          138
vidnoe                124
svo                   119
pavlovskyposad        105
sergievposad          105
troick                 95
kotelniki              88
chehov                 81
dzerzhinsky            75
domodedovo             73
ramenskoe              69
vko                    67
dme                    67
nakhabino              62
zheleznodorozhny       55
korolev                50
zelenograd             33
monino                 32
zvenigorod             31
lobnja                 31
obninsk                27
electrostal            25
aprelevka              24
mikhnevo               19
istra                  16
stupino     

In [58]:
df.query('fact_km == 0').driver.unique()

array([89732334, 12197671, 60912933,  3076344, 83844849, 27314745,
       29119385,   782327, 10832805, 98111392, 56872166,  2233476])

In [59]:
df.currency.unique()

array(['RUB', 'USD'], dtype=object)

In [60]:
df.query('currency == "USD"')

Unnamed: 0,coupon,currency,driver,fact_km,order,plan_km,plan_time,user,zone
16184,,USD,87453549,,9218035,0.616003,3.366667,99251227,moscow


In [61]:
df.coupon.unique()

array([nan, 'micro_code', 'free_ride'], dtype=object)

In [62]:
df.query('coupon == "free_ride"').driver

16671    39939250
16672    39939250
16673    39939250
16674    39939250
16675    39939250
           ...   
16900    39939250
16901    39939250
16902    39939250
16903    39939250
16904    39939250
Name: driver, Length: 234, dtype: int64

In [63]:
df.query('coupon == "micro_code"').nunique()

coupon        1
currency      1
driver       59
fact_km      67
order        83
plan_km      83
plan_time    83
user         81
zone         17
dtype: int64

In [64]:
df.groupby('zone', as_index=False)\
    .agg({'driver': 'nunique', 'user': 'count'})\
    .sort_values(['driver', 'user'], ascending = [True, False])

Unnamed: 0,zone,driver,user
34,novosibirsk,1,7
45,sochi,1,7
3,chelyabinsk,1,6
51,tuchkovo,1,3
18,kostroma,1,2
30,mozhaysk,1,2
37,orehovozuevo,1,2
47,spb,1,2
15,kaluga_region,1,1
16,klin,1,1


In [65]:
drivers_all = df.driver.unique()
drivers_mosckow = df.query('zone == "moscow"').driver.unique()

In [66]:
# Print the drivers that never appear in the "moscow" zone, in first-seen order.
outside_moscow = drivers_all[~np.isin(drivers_all, drivers_mosckow)]
for driver_id in outside_moscow:
    print(driver_id)

59986309


In [67]:
df.query('zone == "novosibirsk"').driver.value_counts()

68102783    7
Name: driver, dtype: int64

In [68]:
df.query('zone == "pavlovskyposad"').driver.value_counts()

22880700    102
34460884      3
Name: driver, dtype: int64

In [69]:
df.query('zone == "sergievposad"').driver.value_counts()

54299147    104
2630481       1
Name: driver, dtype: int64

In [70]:
df.query('zone == "voskresensk"').driver.value_counts()

46629985    313
65014768      2
20999223      1
Name: driver, dtype: int64

In [71]:
df.query('zone == "serpukhov"').driver.value_counts()

12700954    153
68130991     96
20999223      3
59986309      1
Name: driver, dtype: int64

In [72]:
df.query('zone == "narofominsk"').driver.value_counts()

67724939    279
39939250    216
2233476       7
81028125      3
43857106      2
76028546      2
26025772      1
Name: driver, dtype: int64

In [73]:
df.query('zone == "moscow"').driver.value_counts()

56872166    430
36098567    252
22643084    228
82646235    199
46051150    182
           ... 
87453549      1
76456337      1
11808557      1
9599990       1
90985908      1
Name: driver, Length: 198, dtype: int64

In [74]:
zone_count = df.zone.value_counts()

In [75]:
# for i in df.zone.unique():
#     print(i, zone_count[i], df.query(f'zone == "{i}"').driver.value_counts(), sep='\n')
#     print('-------------')

In [76]:
df.head()

Unnamed: 0,coupon,currency,driver,fact_km,order,plan_km,plan_time,user,zone
0,,RUB,17144870,,34341133,9.012953,10.783333,14531103,moscow
1,,RUB,17721342,,7178069,1.518001,6.583333,8019378,moscow
2,,RUB,94414125,9.427,846666,9.88432,22.016667,15232981,moscow
3,,RUB,7615016,45.956,87372383,44.179568,55.8,98174968,moscow
4,,RUB,1459241,11.256,155631,9.57312,15.8,7303325,moscow


In [77]:
df.shape

(17368, 9)

In [78]:
df.nunique()

coupon           2
currency         2
driver         199
fact_km       9730
order        17366
plan_km      16867
plan_time     3580
user         15719
zone            58
dtype: int64

In [79]:
df.groupby('plan_km', as_index=False).agg({'plan_time': 'count'}).sort_values('plan_time', ascending=False)

Unnamed: 0,plan_km,plan_time
6859,5.000000,443
0,0.000000,12
397,0.951463,5
10713,10.179089,3
3961,3.036258,3
...,...,...
5639,4.053311,1
5640,4.053389,1
5641,4.053817,1
5642,4.054414,1


In [80]:
df.describe()

Unnamed: 0,driver,fact_km,order,plan_km,plan_time,user
count,17368.0,12705.0,17368.0,17368.0,17368.0,17368.0
mean,41687040.0,13.692452,39292730.0,11.637259,20.470719,40336620.0
std,31475540.0,16.073378,31703220.0,16.381456,19.471966,32030600.0
min,69542.0,0.0,585.0,0.0,0.0,331.0
25%,8350262.0,3.684,7837021.0,3.237557,8.35,8356656.0
50%,39939250.0,7.735,34604750.0,6.395879,14.2,36978560.0
75%,68130990.0,18.018,67072420.0,14.487683,26.266667,68120750.0
max,99717060.0,249.905,99996600.0,1000.0,459.716667,99990210.0


In [81]:
df.order.value_counts()

1200850     2
40212259    2
4158123     1
88459948    1
88195757    1
           ..
1953232     1
32185609    1
6097528     1
3597574     1
1802240     1
Name: order, Length: 17366, dtype: int64

In [82]:
df.query('order in (1200850, 40212259)')

Unnamed: 0,coupon,currency,driver,fact_km,order,plan_km,plan_time,user,zone
1355,,RUB,10832805,,1200850,6.195091,17.9,98458859,moscow
3996,,RUB,56829931,1.371,40212259,1.375968,3.016667,45631992,odincovo
5097,,RUB,68827832,,40212259,13.505963,23.283333,33261066,krasnogorsk
5705,,RUB,6491296,31.317,1200850,27.824533,33.083333,837448,moscow


In [83]:
drivers = df.driver

In [84]:
# Print every driver id that also occurs as a user id.
# `value in series` tests the Series *index labels*, not its values, so the
# previous check compared driver ids against row numbers and could never
# match a user id. Membership is now tested against the user values.
user_ids = set(df.user)
# Each distinct driver is checked once instead of once per order row.
for driver_id in pd.unique(drivers):
    if driver_id in user_ids:
        print(driver_id)

In [85]:
plan_km_time = df[['plan_km', 'plan_time', 'driver']].query('plan_km > 0').sort_values('plan_km')

In [86]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=plan_km_time.plan_km, y=plan_km_time.plan_time))

In [87]:
plan_km_time.query('(plan_km > 500 & plan_time < 20) | (150 < plan_km < 170 & plan_time > 200)')

Unnamed: 0,plan_km,plan_time,driver
3698,161.999592,459.716667,27314745
16211,1000.0,11.366667,59986309


In [88]:
plan_fact_km = df[['plan_km', 'fact_km', 'driver']].query('plan_km > 0').dropna(subset=['fact_km']).sort_values('plan_km')

In [89]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=plan_fact_km.plan_km, y=plan_fact_km.fact_km))

In [90]:
plan_time_fact_km = df[['plan_time', 'fact_km', 'driver']].sort_values('fact_km')

In [91]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=plan_time_fact_km.fact_km, y=plan_time_fact_km.plan_time))

In [92]:
df.query('2 > fact_km > 0 & plan_time > 400')

Unnamed: 0,coupon,currency,driver,fact_km,order,plan_km,plan_time,user,zone
13189,,RUB,74391989,1.864,945696,521.17445,459.1,92196558,moscow


In [93]:
fucked = plan_fact_km.query('(plan_km > 500 & fact_km < 2) | (8 < plan_km < 100 & fact_km > 150)')
fucked
# plan_fact_km.query('(plan_km > 500 & fact_km < 2) | (8 < plan_km < 100 & (fact_km > 150 | 0 < fact_km < 1))')

Unnamed: 0,plan_km,fact_km,driver
12565,9.182507,245.514,27318902
9634,14.039035,207.183,94883216
4790,14.111042,159.292,66397037
8935,15.667069,249.905,2110926
6369,22.948471,210.657,75732898
3902,25.056732,174.99,125920
9961,57.594734,169.782,44595728
13189,521.17445,1.864,74391989
16211,1000.0,0.01,59986309


In [94]:
fucked[['driver']].to_csv('cup_semi_1', index=False)

In [95]:
fucked[['driver']]

Unnamed: 0,driver
12565,27318902
9634,94883216
4790,66397037
8935,2110926
6369,75732898
3902,125920
9961,44595728
13189,74391989
16211,59986309


In [96]:
df = pd.read_csv('people_by_minute.csv')

In [97]:
df.sample(5, random_state=0)

Unnamed: 0,time,average_people
15,12:15,2.763315
124,14:04,9.599416
141,14:21,5.104985
263,16:23,3.111639
170,14:50,6.106679


In [98]:
df.shape

(480, 2)

In [99]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480 entries, 0 to 479
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   time            480 non-null    object 
 1   average_people  480 non-null    float64
dtypes: float64(1), object(1)
memory usage: 7.6+ KB


In [100]:
df['wait'] = df.average_people / 2

In [101]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=df.time, y=df.average_people))

In [102]:
df.sort_values('average_people', ascending=False).head()

Unnamed: 0,time,average_people,wait
58,12:58,11.848807,5.924404
59,12:59,11.822528,5.911264
57,12:57,11.797285,5.898642
60,13:00,11.717314,5.858657
56,12:56,11.679292,5.839646


In [103]:
df.agg({'average_people':'max'})

average_people    11.848807
dtype: float64

In [104]:
times = df.time

In [105]:
# Find the minute after which `wait` grows the most over the next WINDOW rows.
#   m   - largest growth found so far
#   ind - row position where that growth starts
#   t   - time label at that position
# Fixes: the running maximum starts at -inf instead of a wait value (it is a
# maximum of differences), and the loop now includes the last valid window
# (i + WINDOW == len(times) - 1), which the old bound skipped.
WINDOW = 15
m = float('-inf')
ind = 0
t = 0
for i in range(len(times) - WINDOW):
    diff = df.wait[i + WINDOW] - df.wait[i]
    if diff > m:
        m = diff
        ind = i
        t = times[i]
        # Trace every new running maximum.
        print(m, i, t, times[i + WINDOW], df.wait[i], df.wait[i + WINDOW])
        

1.2761712962962957 0 12:00 12:15 0.10548611111111164 1.3816574074074073
1.3920666666666663 1 12:01 12:16 0.1366296296296298 1.5286962962962962
1.5054212962962963 2 12:02 12:17 0.17179166666666704 1.6772129629629633
1.6149907407407407 3 12:03 12:18 0.21097222222222214 1.8259629629629628
1.7195305555555556 4 12:04 12:19 0.2541712962962964 1.973701851851852
1.8177962962962972 5 12:05 12:20 0.30138888888888854 2.1191851851851857
1.9085435185185187 6 12:06 12:21 0.35262499999999974 2.2611685185185184
1.9905277777777788 7 12:07 12:22 0.4078796296296288 2.3984074074074075
2.0313611111111123 8 12:08 12:23 0.49829629629629546 2.5296574074074076
2.054245061728395 9 12:09 12:24 0.600146296296296 2.654391358024691
2.0606141975308647 10 12:10 12:25 0.7121851851851848 2.7727993827160495


In [106]:
df.iloc[:50]

Unnamed: 0,time,average_people,wait
0,12:00,0.210972,0.105486
1,12:01,0.273259,0.13663
2,12:02,0.343583,0.171792
3,12:03,0.421944,0.210972
4,12:04,0.508343,0.254171
5,12:05,0.602778,0.301389
6,12:06,0.70525,0.352625
7,12:07,0.815759,0.40788
8,12:08,0.996593,0.498296
9,12:09,1.200293,0.600146


In [107]:
df.average_people[0] - df.average_people[0+15]

-2.5523425925925913

In [108]:
def prob(lamda, k_max=10):
    """Return the Poisson probability P(X <= k_max) for rate ``lamda``.

    Sums the Poisson terms ``lamda**k / k! * exp(-lamda)`` for k = 0..k_max.

    Parameters
    ----------
    lamda : float, numpy.ndarray or pandas.Series
        Poisson rate(s); array-like inputs are evaluated element-wise.
    k_max : int, default 10
        Largest count included in the sum (previously hard-coded to 10).

    Returns
    -------
    Cumulative probability, scalar or with the same shape as ``lamda``.
    """
    decay = np.exp(-lamda)  # loop-invariant factor, computed once
    p = 0
    for k in range(k_max + 1):
        # math.factorial replaces np.math.factorial: the np.math alias is
        # deprecated and no longer available in NumPy 2.x.
        p += (lamda ** k) / math.factorial(k) * decay
    return p

In [109]:
np.arange(11)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [110]:
avg_pep = df.wait * 4 / 3

In [111]:
# Find the first row from which prob(avg_pep) stays >= 0.9 through the end of
# the data, and print its time and position.
# The probabilities are computed once for the whole series instead of being
# re-evaluated inside a nested loop, and the series length replaces the
# hard-coded 480.
THRESHOLD = 0.9
below_threshold = np.flatnonzero(np.asarray(prob(avg_pep)) < THRESHOLD)
# The answer is the row right after the last one that falls below the threshold.
first_ok = int(below_threshold[-1]) + 1 if below_threshold.size else 0
if first_ok < len(avg_pep):
    print(df.time[first_ok], first_ok)

13:05 65


In [112]:
probs = np.array(prob(avg_pep))

In [113]:
# for i, num in enumerate(avg_pep):
#     f = 0
#     for j in range(i, len(probs)):
#         p = probs[j]
#         if p < 0.9:
#             f = 1
#             break
#     if f:
#         pass
#     else:
#         print(df.time[i])

In [114]:
df['prob'] = prob(df.wait * 4/3)

In [115]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=df.time, y=df.prob))

In [116]:
def shit(n):
    """Evaluate ``1 - exp(-4 * 30 / (n * 60))`` for ``n``.

    Works element-wise on NumPy arrays and pandas Series as well as scalars.
    NOTE(review): the constants 4, 30 and 60 are undocumented — presumably a
    rate and a seconds/minutes conversion; confirm against the task statement.
    """
    exponent = -4 * 30 / (n * 60)
    return 1 - np.exp(exponent)

In [117]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=df.time, y=shit(df.wait)))