In [22]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.cluster import KMeans, DBSCAN
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
df = pd.read_csv("SL ltv30.csv")
df.head(10)

Unnamed: 0,foi.advertising_id,country,media_source,install_date,ltv1,ltv3,ltv7,ltv30,ses1,ses3,ses7
0,00001905-6d2b-45f8-a27a-0fd8110808eb,Philippines,Organic,2020-03-11,0.0,0.0,0.0,0.0,1,1,1
1,00002b18-b6fa-4f40-bae2-a50a4a6dba46,India,Organic,2020-04-20,0.0,0.0,0.0,0.0,1,4,8
2,00002cd3-6044-4433-9aa3-eea246429db6,India,googleadwords_int,2020-02-19,0.0,0.0,0.0,0.0,1,1,1
3,00004946-4ed0-4317-8bb5-34153ab133c2,Brazil,Organic,2020-03-11,0.0,0.0,0.0,0.0,1,1,1
4,00005168-2b78-4240-a28d-784dca6a4d8a,China,Organic,2020-02-14,0.0,0.0,0.0,0.0,1,1,1
5,00005879-6fe2-4d14-b59a-3638467ab6f7,Vietnam,Organic,2020-03-05,0.0,0.0,0.0,0.0,3,11,18
6,000074c6-6159-4a0a-81ca-07535596acc9,United States,Organic,2020-03-02,0.0,0.0,0.0,0.0,5,5,5
7,0000b24c-478f-475b-bb2f-72acd90dbd9c,Indonesia,,2020-04-17,0.0,0.0,0.0,0.0,1,1,2
8,0000ed04-7776-4619-981b-a2da3d6f4d2a,United States,Organic,2020-03-30,0.0,0.0,0.0,0.0,1,1,1
9,00013677-8959-4c82-8bf8-15cdc42e6759,India,,2020-04-18,0.0,0.0,0.0,0.0,1,1,1


In [15]:
df.rename(columns = {"foi.advertising_id": "advertising_id"}, inplace = True)
df.country.fillna("Unknown", inplace = True)
df.media_source.fillna("Unknown", inplace = True)

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1180468 entries, 0 to 1180467
Data columns (total 11 columns):
advertising_id    1180468 non-null object
country           1180468 non-null object
media_source      1180468 non-null object
install_date      1180468 non-null object
ltv1              1180468 non-null float64
ltv3              1180468 non-null float64
ltv7              1180468 non-null float64
ltv30             1180468 non-null float64
ses1              1180468 non-null int64
ses3              1180468 non-null int64
ses7              1180468 non-null int64
dtypes: float64(4), int64(3), object(4)
memory usage: 99.1+ MB


In [21]:
df.media_source.value_counts() / df.advertising_id.count()

Organic              0.601553
Unknown              0.263699
googleadwords_int    0.129743
Facebook Ads         0.004451
applovin_int         0.000370
restricted           0.000138
Fanpage              0.000047
Name: media_source, dtype: float64

In [34]:
(df.groupby("country").ltv7.sum() / df.groupby("country").advertising_id.count()).sort_values(ascending = False)

country
New Caledonia       2.033426
St. Martin          0.978000
French Polynesia    0.659000
Sint Maarten        0.532000
Lithuania           0.527750
                      ...   
Mayotte             0.000000
Mauritania          0.000000
Marshall Islands    0.000000
Malta               0.000000
?land Islands       0.000000
Length: 229, dtype: float64

In [4]:
df["avg_1day"] = df.LTV1 / df.buy_1day
df["avg_3day"] = df.LTV3 / df.buy_3day
df["avg_7day"] = df.LTV7 / df.buy_7day
df["avg_30day"] = df.LTV30 / df.buy_30day
df["avg_1day"].fillna(0, inplace = True)
df["avg_3day"].fillna(0, inplace = True)
df["avg_7day"].fillna(0, inplace = True)
df["avg_30day"].fillna(0, inplace = True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 872367 entries, 0 to 872366
Data columns (total 19 columns):
advertising_id    872367 non-null object
country           872367 non-null object
install_date      872367 non-null object
buy_1day          872367 non-null int64
buy_3day          872367 non-null int64
buy_7day          872367 non-null int64
buy_30day         872367 non-null int64
LTV1              872367 non-null float64
LTV3              872367 non-null float64
LTV7              872367 non-null float64
LTV30             872367 non-null float64
session_1day      872367 non-null int64
session_3day      872367 non-null int64
session_7day      872367 non-null int64
session_30day     872367 non-null int64
avg_1day          872367 non-null float64
avg_3day          872367 non-null float64
avg_7day          872367 non-null float64
avg_30day         872367 non-null float64
dtypes: float64(8), int64(8), object(3)
memory usage: 126.5+ MB


In [6]:
df.describe()

Unnamed: 0,buy_1day,buy_3day,buy_7day,buy_30day,LTV1,LTV3,LTV7,LTV30,session_1day,session_3day,session_7day,session_30day,avg_1day,avg_3day,avg_7day,avg_30day
count,872367.0,872367.0,872367.0,872367.0,872367.0,872367.0,872367.0,872367.0,872367.0,872367.0,872367.0,872367.0,872367.0,872367.0,872367.0,872367.0
mean,0.002129,0.004626,0.006677,0.010171,0.00758,0.015042,0.020787,0.030984,1.707834,3.235785,4.387707,6.128745,0.0034,0.005441,0.006574,0.008229
std,0.088826,0.159802,0.205645,0.320076,0.645962,1.06067,1.242402,1.710492,1.486548,3.146902,5.137749,9.733935,0.151717,0.182231,0.19547,0.227665
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,3.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,6.0,7.0,0.0,0.0,0.0,0.0
max,21.0,42.0,45.0,77.0,353.79,589.58,605.55,795.099824,184.0,196.0,212.0,311.0,29.663547,29.663547,29.663547,53.985488


In [20]:
X = df[["LTV7", "LTV3", "buy_7day", "buy_3day"]]
cluster = DBSCAN(eps=3, min_samples=2).fit(X)
cluster.labels_

MemoryError: 

In [7]:
df.corr().sort_values("LTV30", ascending = False)

Unnamed: 0,buy_1day,buy_3day,buy_7day,buy_30day,LTV1,LTV3,LTV7,LTV30,session_1day,session_3day,session_7day,session_30day,avg_1day,avg_3day,avg_7day,avg_30day
LTV30,0.483296,0.606745,0.68066,0.763973,0.601685,0.731361,0.851617,1.0,0.014587,0.029569,0.044228,0.069832,0.382804,0.452386,0.48795,0.500335
LTV7,0.606161,0.735512,0.750886,0.583121,0.793531,0.943529,1.0,0.851617,0.015272,0.029166,0.038849,0.041286,0.477987,0.545179,0.560899,0.481981
buy_30day,0.514956,0.685962,0.804739,1.0,0.354209,0.474738,0.583121,0.763973,0.025926,0.056888,0.082376,0.131068,0.298096,0.376272,0.413018,0.424054
LTV3,0.627078,0.746901,0.662692,0.474738,0.868007,1.0,0.943529,0.731361,0.015626,0.025445,0.028518,0.025992,0.512198,0.539628,0.500041,0.426689
buy_7day,0.697049,0.910458,1.0,0.804739,0.500472,0.662692,0.750886,0.68066,0.028434,0.060352,0.08016,0.087911,0.399729,0.489759,0.516044,0.447901
buy_3day,0.784987,1.0,0.910458,0.685962,0.580033,0.746901,0.735512,0.606745,0.030812,0.055497,0.062021,0.059328,0.453791,0.515671,0.483006,0.414367
LTV1,0.689634,0.580033,0.500472,0.354209,1.0,0.868007,0.793531,0.601685,0.019412,0.019829,0.019838,0.016983,0.628447,0.499007,0.465153,0.396488
avg_30day,0.372605,0.414367,0.447901,0.424054,0.396488,0.426689,0.481981,0.500335,0.021151,0.046532,0.063937,0.083615,0.63808,0.781962,0.849663,1.0
avg_7day,0.436586,0.483006,0.516044,0.413018,0.465153,0.500041,0.560899,0.48795,0.023261,0.047031,0.060885,0.062492,0.749802,0.916511,1.0,0.849663
buy_1day,1.0,0.784987,0.697049,0.514956,0.689634,0.627078,0.606161,0.483296,0.040694,0.041916,0.041131,0.036236,0.561955,0.464129,0.436586,0.372605


In [16]:
df[df.LTV30 > 0].corr().sort_values("LTV30", ascending = False)

Unnamed: 0,buy_1day,buy_3day,buy_7day,buy_30day,LTV1,LTV3,LTV7,LTV30,session_1day,session_3day,session_7day,session_30day,avg_1day,avg_3day,avg_7day,avg_30day
LTV30,0.408943,0.547253,0.642319,0.745912,0.578663,0.710927,0.837109,1.0,0.025141,0.034327,0.077452,0.132072,0.298553,0.357309,0.395737,0.410762
LTV7,0.558142,0.712803,0.742879,0.530294,0.783956,0.940274,1.0,0.837109,0.035159,0.049361,0.053483,-0.007939,0.414573,0.482769,0.50418,0.402964
buy_30day,0.377657,0.567982,0.717648,1.0,0.296624,0.419706,0.530294,0.745912,0.052741,0.123966,0.202355,0.31789,0.110434,0.129135,0.137243,0.123334
LTV3,0.595528,0.743699,0.654431,0.419706,0.861946,1.0,0.940274,0.710927,0.047269,0.048222,0.006746,-0.056016,0.466752,0.49743,0.453175,0.361247
buy_7day,0.616557,0.878573,1.0,0.717648,0.477028,0.654431,0.742879,0.642319,0.069561,0.152751,0.168253,0.032241,0.23867,0.281837,0.28131,0.149862
LTV1,0.679494,0.564427,0.477028,0.296624,1.0,0.861946,0.783956,0.578663,0.081431,0.027924,-0.026568,-0.070465,0.609072,0.470105,0.435775,0.352756
buy_3day,0.733017,1.0,0.878573,0.567982,0.564427,0.743699,0.712803,0.547253,0.09997,0.148184,0.052122,-0.089589,0.325457,0.34677,0.272259,0.14993
avg_30day,0.159731,0.14993,0.149862,0.123334,0.352756,0.361247,0.402964,0.410762,-0.004708,-0.064357,-0.072296,-0.058986,0.550069,0.686887,0.765518,1.0
buy_1day,1.0,0.733017,0.616557,0.377657,0.679494,0.595528,0.558142,0.408943,0.186384,0.075802,-0.052702,-0.145753,0.478346,0.320856,0.264035,0.159731
avg_7day,0.264035,0.272259,0.28131,0.137243,0.435775,0.453175,0.50418,0.395737,0.024187,-0.015041,-0.050266,-0.152778,0.699284,0.884506,1.0,0.765518


In [8]:
reg = LinearRegression()
scores = cross_validate(reg, df[["LTV7", "LTV3", "buy_7day", "buy_3day"]], df["LTV30"], cv = 5, scoring = ["r2"], return_train_score = True, return_estimator = True)

In [9]:
scores

{'fit_time': array([0.3495822 , 0.09374905, 0.08975983, 0.08776569, 0.0867691 ]),
 'score_time': array([0.02493382, 0.00997376, 0.00897646, 0.00897598, 0.0099721 ]),
 'estimator': (LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
  LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
  LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
  LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
  LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)),
 'test_r2': array([0.7009487 , 0.81210016, 0.84258822, 0.76065991, 0.7377754 ]),
 'train_r2': array([0.82526697, 0.76937192, 0.76322681, 0.77429145, 0.77480582])}

In [10]:
df[df.LTV7 > 0].advertising_id.count() / df.advertising_id.count() * 100

0.26514070339662094

In [11]:
df[df.LTV30 > 0].advertising_id.count() / df.advertising_id.count() * 100

0.33311668139670575

In [12]:
for model in scores["estimator"]:
    print(model.coef_)

[ 1.35214672 -0.37818803  1.19609924 -0.85759468]
[ 2.03920718 -1.12429827 -0.12904057  0.50257683]
[ 2.18401936 -1.28644748 -0.5175493   1.04349367]
[ 2.13892912 -1.22023565 -0.44313322  0.80637489]
[ 2.38226009 -1.43469755 -0.97598338  1.29955548]


In [13]:
X = df[["LTV7"]]
kmeans = KMeans(n_clusters = 2, random_state = 0).fit(X)
kmeans.labels_

array([0, 0, 0, ..., 0, 0, 0])

In [14]:
pd.Series(kmeans.labels_).value_counts()

0    872326
1        41
dtype: int64