In [18]:
from scipy.stats import ttest_ind, pearsonr, spearmanr, mannwhitneyu
from warnings import filterwarnings as wfw
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random as rm
import scipy.stats as st
import seaborn as sns

In [19]:
# Load the three raw tables. Each frame's columns are renamed positionally to
# snake_case, so the lists below must stay in the same order as the columns of
# the corresponding CSV file.
# Note: 'intergation_cost' is spelled this way on purpose here - later cells
# refer to the column by exactly this name.
orders = pd.read_csv(
    'orders.csv',
    parse_dates=['Departure Date', 'Arrival Date', 'Created Date'],
).set_axis(
    ['user_id', 'order_id', 'transaction_value', 'commission', 'processing_cost',
     'promocode_cost', 'intergation_cost', 'destination_id', 'departure_date',
     'arrival_date', 'order_created_date'],
    axis='columns',
)

users = pd.read_csv('users.csv', parse_dates=['Created Date']).set_axis(
    ['user_id', 'region_id', 'user_created_date', 'language', 'role', 'status'],
    axis='columns',
)

region = pd.read_csv('region.csv').set_axis(['id', 'name', 'level', 'language'], axis='columns')

---

In [20]:
# Preview the first four order rows to sanity-check the renamed columns.
orders.head(n=4)

Unnamed: 0,user_id,order_id,transaction_value,commission,processing_cost,promocode_cost,intergation_cost,destination_id,departure_date,arrival_date,order_created_date
0,121100,1110022,5547,632.358,245.7321,150.0,137.5656,413,2020-03-30,2020-03-11,2020-01-10
1,121100,1110001,5836,490.224,258.5348,50.0,,1132,2020-09-03,2020-08-17,2020-06-06
2,121100,1110004,5181,538.824,241.4346,100.0,123.8259,173,2020-02-11,2020-01-14,2019-12-06
3,121100,1110010,3653,292.24,172.7869,50.0,,1098,2020-03-18,2020-03-03,2020-02-28


In [21]:
# Replace NaN with 0 in the monetary columns so that the sum below never
# propagates a missing value into `marketing_cost`.
orders = orders.fillna({
    money_col: 0
    for money_col in ('transaction_value', 'commission', 'processing_cost',
                      'promocode_cost', 'intergation_cost')
})
# marketing_cost = processing + promocode + integration cost per order.
orders['marketing_cost'] = (
    orders['processing_cost'] + orders['promocode_cost'] + orders['intergation_cost']
)

In [22]:
# Remove the cost components (already summed into `marketing_cost`) and the
# trip-related columns. The frame is reassigned rather than mutated with
# inplace=True, and errors='ignore' skips labels that are already gone, so
# re-running this cell no longer raises a KeyError.
orders = orders.drop(
    columns=['processing_cost', 'promocode_cost', 'intergation_cost',
             'destination_id', 'departure_date', 'arrival_date'],
    errors='ignore',
)

In [23]:
# Put the identifying columns first and the monetary columns last, then show
# the resulting frame.
orders = orders.reindex(
    [
        'order_id',
        'order_created_date',
        'user_id',
        'transaction_value',
        'commission',
        'marketing_cost',
    ],
    axis='columns',
)
orders

Unnamed: 0,order_id,order_created_date,user_id,transaction_value,commission,marketing_cost
0,1110022,2020-01-10,121100,5547,632.358,533.2977
1,1110001,2020-06-06,121100,5836,490.224,308.5348
2,1110004,2019-12-06,121100,5181,538.824,465.2605
3,1110010,2020-02-28,121100,3653,292.240,222.7869
4,1110013,2019-12-05,121100,4772,391.304,250.5300
...,...,...,...,...,...,...
48702,1279726,2021-04-13,131156,6415,551.690,432.2970
48703,1279727,2021-06-26,131156,6127,618.827,470.4421
48704,1279729,2021-02-11,131156,4825,410.125,400.2950
48705,1279731,2021-06-19,131156,2763,320.508,143.1234


In [24]:
# Preview the first four rows of the region lookup table.
region.head(n=4)

Unnamed: 0,id,name,level,language
0,3,Abenberg,V,de
1,5,Abensberg,V,de
2,9,Augustusburg,V,de
3,11,Adelsheim,V,de


In [25]:
# Discard `name` and `level`, leaving the remaining region columns untouched.
# errors='ignore' skips labels that were already removed, so the cell can be
# re-run without raising a KeyError.
region = region.drop(columns=['name', 'level'], errors='ignore')
region

Unnamed: 0,id,language
0,3,de
1,5,de
2,9,de
3,11,de
4,15,de
...,...,...
498,1449,no
499,1451,no
500,1454,no
501,1458,no


In [26]:
# Preview the first four user rows.
users.head(n=4)

Unnamed: 0,user_id,region_id,user_created_date,language,role,status
0,121100,755,2019-11-28,en,master,deleted
1,121102,1174,2020-04-02,fr,finance,deleted
2,121105,1267,2021-03-09,se,finance,active
3,121108,763,2019-04-05,en,manager,active


In [27]:
# Discard `region_id`, `role` and `status`, leaving the remaining user columns
# untouched. errors='ignore' skips labels that were already removed, so the
# cell can be re-run without raising a KeyError.
users = users.drop(columns=['region_id', 'role', 'status'], errors='ignore')
users

Unnamed: 0,user_id,user_created_date,language
0,121100,2019-11-28,en
1,121102,2020-04-02,fr
2,121105,2021-03-09,se
3,121108,2019-04-05,en
4,121110,2021-10-04,en
...,...,...,...
4995,131150,2019-09-02,de
4996,131151,2020-09-29,en
4997,131152,2021-09-22,en
4998,131154,2019-04-17,en


---

In [30]:
# Attach the user columns to every order. A left join on `user_id` keeps all
# order rows, including any whose user is absent from `users`.
# The suffixes only take effect if both frames share a non-key column name.
dataframe_to_calculate = orders.merge(
    users,
    on='user_id',
    how='left',
    suffixes=('_or', '_us'),
)

dataframe_to_calculate

Unnamed: 0,order_id,order_created_date,user_id,transaction_value,commission,marketing_cost,user_created_date,language
0,1110022,2020-01-10,121100,5547,632.358,533.2977,2019-11-28,en
1,1110001,2020-06-06,121100,5836,490.224,308.5348,2019-11-28,en
2,1110004,2019-12-06,121100,5181,538.824,465.2605,2019-11-28,en
3,1110010,2020-02-28,121100,3653,292.240,222.7869,2019-11-28,en
4,1110013,2019-12-05,121100,4772,391.304,250.5300,2019-11-28,en
...,...,...,...,...,...,...,...,...
48702,1279726,2021-04-13,131156,6415,551.690,432.2970,2021-01-02,ua
48703,1279727,2021-06-26,131156,6127,618.827,470.4421,2021-01-02,ua
48704,1279729,2021-02-11,131156,4825,410.125,400.2950,2021-01-02,ua
48705,1279731,2021-06-19,131156,2763,320.508,143.1234,2021-01-02,ua


In [32]:
# Show the merged frame with order columns first and user columns last.
# NOTE(review): the reordered frame is only displayed, not assigned back to
# `dataframe_to_calculate` - confirm whether the new order was meant to persist.
dataframe_to_calculate.reindex(
    [
        'order_id', 'order_created_date', 'transaction_value', 'commission',
        'marketing_cost', 'user_id', 'user_created_date', 'language',
    ],
    axis='columns',
)

Unnamed: 0,order_id,order_created_date,transaction_value,commission,marketing_cost,user_id,user_created_date,language
0,1110022,2020-01-10,5547,632.358,533.2977,121100,2019-11-28,en
1,1110001,2020-06-06,5836,490.224,308.5348,121100,2019-11-28,en
2,1110004,2019-12-06,5181,538.824,465.2605,121100,2019-11-28,en
3,1110010,2020-02-28,3653,292.240,222.7869,121100,2019-11-28,en
4,1110013,2019-12-05,4772,391.304,250.5300,121100,2019-11-28,en
...,...,...,...,...,...,...,...,...
48702,1279726,2021-04-13,6415,551.690,432.2970,131156,2021-01-02,ua
48703,1279727,2021-06-26,6127,618.827,470.4421,131156,2021-01-02,ua
48704,1279729,2021-02-11,4825,410.125,400.2950,131156,2021-01-02,ua
48705,1279731,2021-06-19,2763,320.508,143.1234,131156,2021-01-02,ua


In [12]:
# Calendar years 2019 through 2022 (the stop value 2023 is exclusive).
years = np.arange(start=2019, stop=2023)
years

array([2019, 2020, 2021, 2022])

In [13]:
# Number of orders each user created in each calendar month.
# The count is reported in the `order_id` column of the result.
(
    orders
    .assign(
        year=lambda frame: frame['order_created_date'].dt.year,
        month=lambda frame: frame['order_created_date'].dt.month,
    )
    .groupby(['year', 'month', 'user_id'])['order_id']
    .count()
    .reset_index()
)

Unnamed: 0,year,month,user_id,order_id
0,2019,1,121145,2
1,2019,1,121767,1
2,2019,1,121867,1
3,2019,1,121998,1
4,2019,1,122019,4
...,...,...,...,...
26334,2022,6,129802,1
26335,2022,6,130169,1
26336,2022,6,130274,1
26337,2022,6,130314,1
