# With Rule-Based Classification Lead Yield Calculation

In [74]:
import pandas as pd
import numpy as np

In [75]:
df = pd.read_csv("persona.csv")
df.head()

Unnamed: 0,PRICE,SOURCE,SEX,COUNTRY,AGE
0,39,android,male,bra,17
1,39,android,male,bra,17
2,49,android,male,bra,17
3,29,android,male,tur,17
4,49,android,male,tur,17


In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   PRICE    5000 non-null   int64 
 1   SOURCE   5000 non-null   object
 2   SEX      5000 non-null   object
 3   COUNTRY  5000 non-null   object
 4   AGE      5000 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 234.4+ KB


In [77]:
df["SOURCE"].value_counts()

android    2974
ios        2026
Name: SOURCE, dtype: int64

In [78]:
df["PRICE"].nunique()

6

In [79]:
df["PRICE"].value_counts()

29    1305
39    1260
49    1031
19     992
59     212
9      200
Name: PRICE, dtype: int64

In [80]:
df["COUNTRY"].value_counts()

usa    2065
bra    1496
deu     455
tur     451
fra     303
can     230
Name: COUNTRY, dtype: int64

In [81]:
df.groupby("COUNTRY")["PRICE"].sum()

COUNTRY
bra    51354
can     7730
deu    15485
fra    10177
tur    15689
usa    70225
Name: PRICE, dtype: int64

In [82]:
df.groupby("COUNTRY")["PRICE"].mean()

COUNTRY
bra    34.327540
can    33.608696
deu    34.032967
fra    33.587459
tur    34.787140
usa    34.007264
Name: PRICE, dtype: float64

In [83]:
df["SOURCE"].value_counts()

android    2974
ios        2026
Name: SOURCE, dtype: int64

In [84]:
df.groupby("SOURCE")["PRICE"].mean()

SOURCE
android    34.174849
ios        34.069102
Name: PRICE, dtype: float64

In [85]:
df.groupby(["COUNTRY", "SOURCE"])["PRICE"].mean()

COUNTRY  SOURCE 
bra      android    34.387029
         ios        34.222222
can      android    33.330709
         ios        33.951456
deu      android    33.869888
         ios        34.268817
fra      android    34.312500
         ios        32.776224
tur      android    36.229437
         ios        33.272727
usa      android    33.760357
         ios        34.371703
Name: PRICE, dtype: float64

In [86]:
df.groupby(["COUNTRY", "SOURCE", "SEX", "AGE"])["PRICE"].mean()

COUNTRY  SOURCE   SEX     AGE
bra      android  female  15     38.714286
                          16     35.944444
                          17     35.666667
                          18     32.255814
                          19     35.206897
                                   ...    
usa      ios      male    42     30.250000
                          50     39.000000
                          53     34.000000
                          55     29.000000
                          59     46.500000
Name: PRICE, Length: 348, dtype: float64

In [87]:
agg_df = df.groupby(["COUNTRY", "SOURCE", "SEX", "AGE"])["PRICE"].mean().sort_values(ascending=False)
agg_df

COUNTRY  SOURCE   SEX     AGE
bra      android  male    46     59.0
usa      android  male    36     59.0
fra      android  female  24     59.0
usa      ios      male    32     54.0
deu      android  female  36     49.0
                                 ... 
usa      ios      female  38     19.0
                          30     19.0
can      android  female  27     19.0
fra      android  male    18     19.0
deu      android  male    26      9.0
Name: PRICE, Length: 348, dtype: float64

In [88]:
agg_df = agg_df.reset_index()

In [89]:
bins = [0, 18, 23, 30, 40, 70]
labels = ["0_18", "19_23", "24_30", "31_40", "41_70"]
agg_df["AGE_CAT"]= pd.cut(agg_df["AGE"], bins=bins, labels=labels, right=False)
agg_df

Unnamed: 0,COUNTRY,SOURCE,SEX,AGE,PRICE,AGE_CAT
0,bra,android,male,46,59.0,41_70
1,usa,android,male,36,59.0,31_40
2,fra,android,female,24,59.0,24_30
3,usa,ios,male,32,54.0,31_40
4,deu,android,female,36,49.0,31_40
...,...,...,...,...,...,...
343,usa,ios,female,38,19.0,31_40
344,usa,ios,female,30,19.0,31_40
345,can,android,female,27,19.0,24_30
346,fra,android,male,18,19.0,19_23


In [90]:
agg_df['customers_level_based'] = agg_df.apply(lambda row: f"{row['COUNTRY']}_{row['SOURCE']}_{row['SEX']}_{row['AGE_CAT']}", axis=1)
agg_df = agg_df.drop(columns=["COUNTRY", "SOURCE", "SEX", "AGE", "AGE_CAT"])
agg_df

Unnamed: 0,PRICE,customers_level_based
0,59.0,bra_android_male_41_70
1,59.0,usa_android_male_31_40
2,59.0,fra_android_female_24_30
3,54.0,usa_ios_male_31_40
4,49.0,deu_android_female_31_40
...,...,...
343,19.0,usa_ios_female_31_40
344,19.0,usa_ios_female_31_40
345,19.0,can_android_female_24_30
346,19.0,fra_android_male_19_23


In [92]:
segments = pd.qcut(agg_df['PRICE'], q=4, labels=False)
labels = ['D', 'C', 'B', 'A']
agg_df['SEGMENT'] = [labels[x] for x in segments]

In [93]:
agg_df

Unnamed: 0,PRICE,customers_level_based,SEGMENT
0,59.0,bra_android_male_41_70,A
1,59.0,usa_android_male_31_40,A
2,59.0,fra_android_female_24_30,A
3,54.0,usa_ios_male_31_40,A
4,49.0,deu_android_female_31_40,A
...,...,...,...
343,19.0,usa_ios_female_31_40,D
344,19.0,usa_ios_female_31_40,D
345,19.0,can_android_female_24_30,D
346,19.0,fra_android_male_19_23,D


In [97]:
def predict_income(segment, age, source, sex):  
    if segment == 'A':
        if age <= 30:
            return 300 
        else:
            return 400 
    elif segment == 'B':
        return 250 
    elif segment == 'C':
        return 150 
    else:
        return 100 

predicted_income = predict_income('A', 33, 'android', 'female')
print("TUR_ANDROID_FEMALE_31_40: ", predicted_income)

predicted_income = predict_income('C', 35, 'ios', 'female')
print("FRA_IOS_FEMALE_31_40    : ", predicted_income)

TUR_ANDROID_FEMALE_31_40:  400
FRA_IOS_FEMALE_31_40    :  150
