# Import libraries and load dataset

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

In [6]:
df = pd.read_excel("sk install.xlsx", sheet_name = "Sheet1")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 866 entries, 0 to 865
Data columns (total 16 columns):
country_code         866 non-null object
applovin_int         866 non-null int64
AppsFlyer_Test       866 non-null int64
chatbot beta         866 non-null int64
Facebook Ads         866 non-null int64
fanpage beta         866 non-null int64
fanpage pre-reg      866 non-null int64
googleadwords_int    866 non-null int64
ironsource_int       866 non-null int64
Organic              866 non-null int64
restricted           866 non-null int64
Skgroup1             866 non-null int64
unityads_int         866 non-null int64
youtube              866 non-null int64
non-organic          866 non-null int64
event_date           866 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(14), object(1)
memory usage: 108.4+ KB


In [45]:
df["Other"] = df["non-organic"] - df["Facebook Ads"] - df.googleadwords_int - df.unityads_int

# Linear Regression

In [74]:
# cross validate (train)
country_code = ["BR", "ID", "PH", "TH", "VN"]
for c in country_code:
    reg1 = LinearRegression()
    X = df.loc[df.country_code == c, ["Facebook Ads", "googleadwords_int", "unityads_int", "Other"]]
    y = df.loc[df.country_code == c,"Organic"]
    cv_results = cross_validate(reg1, X, y, cv = 10, return_train_score = True, return_estimator = True)
    print(c)
    for i, model in enumerate(cv_results["estimator"]):
        print("- Lần chạy %d:" % (i+1))
        print("R squared: %f" % cv_results["train_score"][i])
        print("K factor: %s" % model.coef_)
        print("Base organic: %f" % model.intercept_)
    print("Mean train score: %f" % cv_results["train_score"].mean())
    print("Std train score: %f" % cv_results["train_score"].std())
    print("----------------------------------------------")
    print("")

BR
- Lần chạy 1:
R squared: 0.371586
K factor: [ 0.03534142  0.64710352 -0.09603415  0.22622568]
Base organic: 0.478419
- Lần chạy 2:
R squared: 0.379615
K factor: [ 0.04047677  0.64383575 -0.10093314  0.22694222]
Base organic: 0.050733
- Lần chạy 3:
R squared: 0.505736
K factor: [ 0.05538294  0.82679067 -0.14725387  0.18453365]
Base organic: -2.154916
- Lần chạy 4:
R squared: 0.511315
K factor: [ 0.035653    0.86547474 -0.07997156  0.13981393]
Base organic: -2.934099
- Lần chạy 5:
R squared: 0.392969
K factor: [ 0.02797604  0.66687986 -0.10938689  0.22353662]
Base organic: 1.471284
- Lần chạy 6:
R squared: 0.401669
K factor: [ 0.02625823  0.66636707 -0.09624748  0.23052517]
Base organic: -0.932891
- Lần chạy 7:
R squared: 0.421819
K factor: [ 0.02090092  0.6335677  -0.10327435  0.44901599]
Base organic: 1.537083
- Lần chạy 8:
R squared: 0.403557
K factor: [ 0.02621308  0.65240548 -0.09344165  0.27660284]
Base organic: 1.296122
- Lần chạy 9:
R squared: 0.509568
K factor: [-0.00036764  

#### Kết luận
Có thể thấy Google Ads là kênh có hiệu ứng Organic Uplift tốt nhất tại tất cả các quốc gia <br>
Thái Lan có R squared rất thấp, có thể thấy hiệu ứng Organic Uplift không thể hiện rõ ràng tại quốc gia này <br>
Brazil có R squared trung bình, tuy nhiên mức base organic có xuất hiện âm cho thấy dữ liệu bị ảnh hưởng nhiều bởi outlier

In [78]:
# train all
country_code = ["ID", "PH", "VN"]
for c in country_code:
    reg1 = LinearRegression()
    X = df.loc[df.country_code == c, ["Facebook Ads", "googleadwords_int", "unityads_int", "Other"]]
    y = df.loc[df.country_code == c,"Organic"]
    reg1.fit(X, y)
    print(c)
    print("R squared: %f" % reg1.score(X, y))
    print("Organic = Facebook * %f + Google * %f + Unity * %f + Other * %f + %f" % (reg1.coef_[0], reg1.coef_[1], reg1.coef_[2], reg1.coef_[3], reg1.intercept_))
    print("---------------")

ID
R squared: 0.755180
Organic = Facebook * -0.033870 + Google * 0.098337 + Unity * 0.025391 + Other * 0.230490 + 3.974675
---------------
PH
R squared: 0.416746
Organic = Facebook * 0.011404 + Google * 0.043149 + Unity * 0.010619 + Other * 0.135102 + 4.912636
---------------
VN
R squared: 0.372660
Organic = Facebook * -0.094025 + Google * 0.107453 + Unity * -0.049135 + Other * -0.604666 + 19.058542
---------------
