In [1]:
import pandas as pd
import math
import numpy as np
import re

## Загружаем данные

In [2]:
raw_data=pd.read_csv('Data.csv')

In [3]:
raw_data['InvoiceDate'] = pd.to_datetime(raw_data['InvoiceDate'], format='%Y-%m-%d') # Убираем время за ненадобностью
raw_data['InvoiceDate'] = raw_data['InvoiceDate'].dt.strftime('%Y-%m-%d')

In [4]:
raw_data

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01,2.10,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01,1.25,13085.0,United Kingdom
...,...,...,...,...,...,...,...,...
944463,575312,22083,PAPER CHAIN KIT RETROSPOT,6,2011-11-09,2.95,13588.0,United Kingdom
944464,575312,23355,HOT WATER BOTTLE KEEP CALM,4,2011-11-09,4.95,13588.0,United Kingdom
944465,575312,22110,BIRD HOUSE HOT WATER BOTTLE,6,2011-11-09,2.55,13588.0,United Kingdom
944466,575312,22037,ROBOT BIRTHDAY CARD,12,2011-11-09,0.42,13588.0,United Kingdom


## Интересующее событие - количество купленных товаров в чеках каждого покупателя в первые три дня после новогодних праздников,
## Событие наблюдение - страна, где сделана покупка

In [5]:
df=raw_data[raw_data.InvoiceDate.str.contains(r'20\d\d-01-0[4-6]', regex= True, na=False)]  # Выбираем даты только с 4 по 6 янаря
#Выбраны именно даты с 4 по 6 января, так как с 1 по 3 января данных нет

In [6]:
df

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
44722,493410,TEST001,This is a test product.,5,2010-01-04,4.50,12346.0,United Kingdom
44723,C493411,21539,RETRO SPOTS BUTTER DISH,-1,2010-01-04,4.25,14590.0,United Kingdom
44724,493412,TEST001,This is a test product.,5,2010-01-04,4.50,12346.0,United Kingdom
44725,493413,21724,PANDA AND BUNNIES STICKER SHEET,1,2010-01-04,0.85,,United Kingdom
44726,493413,84578,ELEPHANT TOY WITH BLUE T-SHIRT,1,2010-01-04,3.75,,United Kingdom
...,...,...,...,...,...,...,...,...
543115,540397,22178,VICTORIAN GLASS HANGING T-LIGHT,6,2011-01-06,1.25,13187.0,United Kingdom
543116,540397,22173,METAL 4 HOOK HANGER FRENCH CHATEAU,1,2011-01-06,2.95,13187.0,United Kingdom
543117,540397,84406B,CREAM CUPID HEARTS COAT HANGER,1,2011-01-06,3.25,13187.0,United Kingdom
543118,540397,22794,SWEETHEART WIRE MAGAZINE RACK,1,2011-01-06,7.95,13187.0,United Kingdom


In [7]:
df_group=df.groupby(['Customer ID', 'Country']).agg({'Quantity': ['sum']}). reset_index()  #Группируем по ID покупателя
df_group.columns = df_group.columns.droplevel(1)

In [8]:
df_group

Unnamed: 0,Customer ID,Country,Quantity
0,12346.0,United Kingdom,10
1,12386.0,Australia,140
2,12395.0,Belgium,-1
3,12413.0,France,177
4,12415.0,Australia,4662
...,...,...,...
201,18109.0,United Kingdom,51
202,18145.0,United Kingdom,676
203,18171.0,United Kingdom,96
204,18226.0,United Kingdom,366


In [9]:
df_group['Quantity']=(df_group['Quantity']//100+1)*100  # Разбиваем количество товаров в чеке на интервалы с шагом 100
df_group

Unnamed: 0,Customer ID,Country,Quantity
0,12346.0,United Kingdom,100
1,12386.0,Australia,200
2,12395.0,Belgium,0
3,12413.0,France,200
4,12415.0,Australia,4700
...,...,...,...
201,18109.0,United Kingdom,100
202,18145.0,United Kingdom,700
203,18171.0,United Kingdom,100
204,18226.0,United Kingdom,400


In [10]:
# Создаём датафреймы отдельно для уникальных стран и для интервалов количества товаров в чеках
countries=df_group['Country'].unique()
quantities=df_group['Quantity'].unique()

df_countries=pd.DataFrame(data=countries, columns=['Country'])
df_countries['key']=0
df_quantities=pd.DataFrame(data=quantities, columns=['Quantity'])
df_quantities['key']=0

In [11]:
# Создаёмя с помощью cross join таблицу совместного распределения
prob=df_countries.merge(df_quantities, on='key', how='outer')
prob['Probability']=0

In [12]:
df_len=len(df_group)  # Общее число опытов

In [13]:
# Функция, подсчитывающая количество строк для определённой страны и количества товаров в чеке
def count_row(df_group, country, quantity):
    df_temp=df_group[(df_group['Country']==country)&(df_group['Quantity']==quantity)]
    return len(df_temp)

In [15]:
# Заполняем таблицу вероятностей
for i, row in prob.iterrows():
    temp_country=row['Country']
    temp_quantity=row['Quantity']
    result=count_row(df_group, temp_country, temp_quantity)
    prob.iat[i, 3] = result/df_len

## Результат расчётов (для наглядности взяты только значения больше 0.001)

In [16]:
prob.drop(['key'], axis=1, inplace = True)
prob=prob.sort_values('Probability', ascending=False)
prob[prob['Probability']>=0.001]

Unnamed: 0,Country,Quantity,Probability
0,United Kingdom,100,0.199029
1,United Kingdom,200,0.179612
4,United Kingdom,300,0.116505
2,United Kingdom,0,0.097087
5,United Kingdom,400,0.067961
14,United Kingdom,600,0.053398
12,United Kingdom,500,0.053398
6,United Kingdom,700,0.019417
20,United Kingdom,800,0.019417
98,Germany,0,0.014563
