In [52]:
import os
import random

import pandas as pd

In [53]:
# Relative directory that holds the raw dataset file read in the next cell.
DATASET_PATH = "dataset"

In [54]:
# Load the raw customer extract with read_csv's defaults (comma-separated,
# first row as header). Missing values show up as literal "?" strings in the
# displayed frame; they are not converted to NaN here.
raw_file = os.path.join(DATASET_PATH, "7k dataset.txt")
df = pd.read_csv(raw_file)
df

Unnamed: 0,TENURE,AGE,GENDER,NATIONALITY,STATE,STATUS_BEFORE,STATUS_AFTER,OFFER_TAKER,OFFER_TAKE_UP_DT,DATA_PURC_BEFORE,...,RLD_IND_BEFORE,RLD_IND_AFTER,RLD_AMT_BEFORE,RLD_AMT_AFTER,CPA_RVN_BEFORE,CPA_RVN_AFTER,ARPU_BEFORE,ARPU_AFTER,ACTVIITY_DAYS_AFTER,ACTIVITY_STATUS_AFTER
0,133,33,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/6/2022,Y,...,Y,Y,30.00,30.00,?,?,26.000000,17.000000,2,DURING & AFTER CAMP
1,37,41,Male,Malaysia,SARAWAK,Active,Active,Y,10/6/2022,Y,...,N,N,?,?,?,?,?,?,37,NO ACTIVITY SINCE SEP
2,44,21,Male,Malaysia,SARAWAK,Active,Active,N,?,N,...,N,N,?,?,?,?,?,?,38,NO ACTIVITY SINCE SEP
3,176,69,Female,Malaysia,PENANG,Active,Active,N,?,N,...,N,N,?,?,?,?,?,?,37,NO ACTIVITY SINCE SEP
4,171,79,Male,Malaysia,JOHORE,Active,Active,Y,10/6/2022,N,...,N,N,?,?,?,?,?,?,5,DURING & AFTER CAMP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7267,28,27,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/6/2022,Y,...,N,N,?,?,?,?,?,?,0,DURING & AFTER CAMP
7268,48,25,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/8/2022,Y,...,Y,N,5.00,?,0.15,?,8.730000,3.120000,0,DURING & AFTER CAMP
7269,71,65,Male,Malaysia,PAHANG,Active,Active,Y,10/6/2022,Y,...,N,Y,?,40.00,?,?,29.980000,40.000000,0,DURING & AFTER CAMP
7270,30,33,Male,Malaysia,JOHORE,Active,Active,N,?,N,...,Y,N,20.00,?,?,?,17.000000,?,36,NO ACTIVITY SINCE SEP


In [55]:
# List the column labels of the loaded frame.
df.columns

Index(['TENURE', 'AGE', 'GENDER', 'NATIONALITY', 'STATE', 'STATUS_BEFORE',
       'STATUS_AFTER', 'OFFER_TAKER', 'OFFER_TAKE_UP_DT', 'DATA_PURC_BEFORE',
       'DATA_PURC_AFTER', 'DATA_CHRG_BEFORE', 'DATA_CHRG_AFTER',
       'DATA_USG_BEFORE', 'DATA_USG_AFTER', 'VOICE_USG_BEFORE',
       'VOICE_USG_AFTER', 'RLD_IND_BEFORE', 'RLD_IND_AFTER', 'RLD_AMT_BEFORE',
       'RLD_AMT_AFTER', 'CPA_RVN_BEFORE', 'CPA_RVN_AFTER', 'ARPU_BEFORE',
       'ARPU_AFTER', 'ACTVIITY_DAYS_AFTER', 'ACTIVITY_STATUS_AFTER'],
      dtype='object')

## Requirements for midterms
1. Dataset has enough attributes for star schema drill down and roll up.
2. Random values generated for newly added attributes.
3. Resulting dataset is 10% of the original dataset, sampled randomly.

### Existing attributes
| Variable | Description |
| --- | --- | 
| ID | Customer ID
| TENURE | Customer duration with Kation since registration date 
| AGE | Customer age 
| GENDER | Customer gender 
| NATIONALITY | Customer nationality 
| STATE | Customer hometown (state) 
| STATUS_BEFORE | Customer status before campaign launched. 
| STATUS_AFTER | Customer status after campaign ended. 
| OFFER_TAKER | Indicator for customers who opted in to the migration plan. 
| OFFER_TAKE_UP_DT | Date on which the customer opted in to the migration plan. 
| DATA_PURC_BEFORE | Indicator for customer who purchased data before campaign launched. 
| DATA_PURC_AFTER | Indicator for customer who purchased data after campaign ended. 
| DATA_CHRG_BEFORE | Total amount of data charged before campaign launched. 
| DATA_CHRG_AFTER | Total amount of data charged after campaign ended. 
| DATA_USG_BEFORE | Data usage before campaign launched. 
| DATA_USG_AFTER | Data usage after campaign ended. 
| VOICE_USG_BEFORE | Voice usage before campaign launched. 
| VOICE_USG_AFTER | Voice usage after campaign ended. 
| RLD_IND_BEFORE | Indicator for customers who reloaded before campaign launched. 
| RLD_IND_AFTER | Indicator for customers who reloaded after campaign ended. 
| RLD_AMT_BEFORE | Total of reload amount before campaign launched. 
| RLD_AMT_AFTER | Total of reload amount after campaign ended. 
| ARPU_BEFORE | ARPU before campaign launched. 
| CPA_RVN_BEFORE | Total added value service before campaign launched. 
| CPA_RVN_AFTER | Total added value service after campaign ended. 
| ARPU_AFTER | ARPU after campaign ended. 
| ACTIVITY_DAYS_AFTER | Silent days after campaign ended (the column is spelled `ACTVIITY_DAYS_AFTER` in the dataset). 
| ACTIVITY_STATUS_AFTER | Customer activity status after campaign ended. 

### Proposed new attributes
| Variable | Description |
| --- | --- | 
| PLAN_ID | Telco plan ID
| PLAN_TYPE | Type of telco plan
| VAS_ID | Value added service ID
| VAS_TYPE | Type of value added service
| DATE_ID | Original date
| YEAR | Year
| QUARTER | Quarter
| MONTH | Month
| DAY | Day

### Values for new attributes
Telco Plans
| PLAN_ID | PLAN_TYPE |
| --- | --- | 
| 1 | Saver
| 2 | Premium

Value added services
| VAS_ID | VAS_TYPE |
| --- | --- | 
| 1 | Caller_tune
| 2 | Data_roaming
| 3 | Idd_call
| 4 | Games_credit
| 5 | Sport_stream
| 6 | Video_stream
| 7 | Music_stream

![Star schema](media/star_schema.jpg)

## Create new attributes

New attributes:
- `PLAN_ID`, `PLAN_TYPE`
- `VAS_ID`, `VAS_TYPE`
- `DATE_ID`, `YEAR`, `QUARTER`, `MONTH`, `DAY`

A sequential customer `ID` is added first. `PLAN_ID` is then drawn at random for every customer, and `PLAN_TYPE` is looked up from `PLAN_ID`.

In [56]:
# Surrogate customer key: consecutive integers starting at 0, one per row.
customer_id = list(range(len(df)))

df["ID"] = customer_id
df

Unnamed: 0,TENURE,AGE,GENDER,NATIONALITY,STATE,STATUS_BEFORE,STATUS_AFTER,OFFER_TAKER,OFFER_TAKE_UP_DT,DATA_PURC_BEFORE,...,RLD_IND_AFTER,RLD_AMT_BEFORE,RLD_AMT_AFTER,CPA_RVN_BEFORE,CPA_RVN_AFTER,ARPU_BEFORE,ARPU_AFTER,ACTVIITY_DAYS_AFTER,ACTIVITY_STATUS_AFTER,ID
0,133,33,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/6/2022,Y,...,Y,30.00,30.00,?,?,26.000000,17.000000,2,DURING & AFTER CAMP,0
1,37,41,Male,Malaysia,SARAWAK,Active,Active,Y,10/6/2022,Y,...,N,?,?,?,?,?,?,37,NO ACTIVITY SINCE SEP,1
2,44,21,Male,Malaysia,SARAWAK,Active,Active,N,?,N,...,N,?,?,?,?,?,?,38,NO ACTIVITY SINCE SEP,2
3,176,69,Female,Malaysia,PENANG,Active,Active,N,?,N,...,N,?,?,?,?,?,?,37,NO ACTIVITY SINCE SEP,3
4,171,79,Male,Malaysia,JOHORE,Active,Active,Y,10/6/2022,N,...,N,?,?,?,?,?,?,5,DURING & AFTER CAMP,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7267,28,27,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/6/2022,Y,...,N,?,?,?,?,?,?,0,DURING & AFTER CAMP,7267
7268,48,25,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/8/2022,Y,...,N,5.00,?,0.15,?,8.730000,3.120000,0,DURING & AFTER CAMP,7268
7269,71,65,Male,Malaysia,PAHANG,Active,Active,Y,10/6/2022,Y,...,Y,?,40.00,?,?,29.980000,40.000000,0,DURING & AFTER CAMP,7269
7270,30,33,Male,Malaysia,JOHORE,Active,Active,N,?,N,...,N,20.00,?,?,?,17.000000,?,36,NO ACTIVITY SINCE SEP,7270


In [57]:
# Seed the stdlib generator once, before its first use, so every `random.*`
# draw from here on repeats when the notebook is run top to bottom.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Candidate plan IDs are 1 and 2; draw one per row, with replacement.
plan_id = range(1, 2 + 1)
plan_ids = random.choices(plan_id, k=len(df))
len(plan_ids)

7272

In [58]:
# Attach the randomly drawn plan IDs as the PLAN_ID column (one value per row).
df = df.assign(PLAN_ID=plan_ids)
df

Unnamed: 0,TENURE,AGE,GENDER,NATIONALITY,STATE,STATUS_BEFORE,STATUS_AFTER,OFFER_TAKER,OFFER_TAKE_UP_DT,DATA_PURC_BEFORE,...,RLD_AMT_BEFORE,RLD_AMT_AFTER,CPA_RVN_BEFORE,CPA_RVN_AFTER,ARPU_BEFORE,ARPU_AFTER,ACTVIITY_DAYS_AFTER,ACTIVITY_STATUS_AFTER,ID,PLAN_ID
0,133,33,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/6/2022,Y,...,30.00,30.00,?,?,26.000000,17.000000,2,DURING & AFTER CAMP,0,1
1,37,41,Male,Malaysia,SARAWAK,Active,Active,Y,10/6/2022,Y,...,?,?,?,?,?,?,37,NO ACTIVITY SINCE SEP,1,1
2,44,21,Male,Malaysia,SARAWAK,Active,Active,N,?,N,...,?,?,?,?,?,?,38,NO ACTIVITY SINCE SEP,2,2
3,176,69,Female,Malaysia,PENANG,Active,Active,N,?,N,...,?,?,?,?,?,?,37,NO ACTIVITY SINCE SEP,3,2
4,171,79,Male,Malaysia,JOHORE,Active,Active,Y,10/6/2022,N,...,?,?,?,?,?,?,5,DURING & AFTER CAMP,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7267,28,27,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/6/2022,Y,...,?,?,?,?,?,?,0,DURING & AFTER CAMP,7267,2
7268,48,25,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/8/2022,Y,...,5.00,?,0.15,?,8.730000,3.120000,0,DURING & AFTER CAMP,7268,2
7269,71,65,Male,Malaysia,PAHANG,Active,Active,Y,10/6/2022,Y,...,?,40.00,?,?,29.980000,40.000000,0,DURING & AFTER CAMP,7269,1
7270,30,33,Male,Malaysia,JOHORE,Active,Active,N,?,N,...,20.00,?,?,?,17.000000,?,36,NO ACTIVITY SINCE SEP,7270,2


In [59]:
# Lookup table from plan ID to plan name.
plans = {
    1: "Saver",
    2: "Premium",
}

# Translate every PLAN_ID through the table; an ID absent from it yields
# dict.get's default of None.
df["PLAN_TYPE"] = df["PLAN_ID"].map(plans.get)
df

Unnamed: 0,TENURE,AGE,GENDER,NATIONALITY,STATE,STATUS_BEFORE,STATUS_AFTER,OFFER_TAKER,OFFER_TAKE_UP_DT,DATA_PURC_BEFORE,...,RLD_AMT_AFTER,CPA_RVN_BEFORE,CPA_RVN_AFTER,ARPU_BEFORE,ARPU_AFTER,ACTVIITY_DAYS_AFTER,ACTIVITY_STATUS_AFTER,ID,PLAN_ID,PLAN_TYPE
0,133,33,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/6/2022,Y,...,30.00,?,?,26.000000,17.000000,2,DURING & AFTER CAMP,0,1,Saver
1,37,41,Male,Malaysia,SARAWAK,Active,Active,Y,10/6/2022,Y,...,?,?,?,?,?,37,NO ACTIVITY SINCE SEP,1,1,Saver
2,44,21,Male,Malaysia,SARAWAK,Active,Active,N,?,N,...,?,?,?,?,?,38,NO ACTIVITY SINCE SEP,2,2,Premium
3,176,69,Female,Malaysia,PENANG,Active,Active,N,?,N,...,?,?,?,?,?,37,NO ACTIVITY SINCE SEP,3,2,Premium
4,171,79,Male,Malaysia,JOHORE,Active,Active,Y,10/6/2022,N,...,?,?,?,?,?,5,DURING & AFTER CAMP,4,2,Premium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7267,28,27,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/6/2022,Y,...,?,?,?,?,?,0,DURING & AFTER CAMP,7267,2,Premium
7268,48,25,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/8/2022,Y,...,?,0.15,?,8.730000,3.120000,0,DURING & AFTER CAMP,7268,2,Premium
7269,71,65,Male,Malaysia,PAHANG,Active,Active,Y,10/6/2022,Y,...,40.00,?,?,29.980000,40.000000,0,DURING & AFTER CAMP,7269,1,Saver
7270,30,33,Male,Malaysia,JOHORE,Active,Active,N,?,N,...,?,?,?,17.000000,?,36,NO ACTIVITY SINCE SEP,7270,2,Premium


`VAS_ID` is assigned at random and `VAS_TYPE` is looked up from it.

This is done in three cycles because some customers subscribe to more than one value-added service: every customer first receives one of services 1–3, then a random 30% of customers receive an additional service from 4–5, and a random 50% receive an additional service from 6–7. Available value-added services:
1. Caller_tune
2. Data_roaming
3. Idd_call
4. Games_credit
5. Sport_stream
6. Video_stream
7. Music_stream

In [60]:
# First cycle: every row receives one service ID drawn from 1-3, with
# replacement and without weights.
vas_id = range(1, 4)
vas_ids = random.choices(vas_id, k=len(df))

df = df.assign(VAS_ID=vas_ids)
df

Unnamed: 0,TENURE,AGE,GENDER,NATIONALITY,STATE,STATUS_BEFORE,STATUS_AFTER,OFFER_TAKER,OFFER_TAKE_UP_DT,DATA_PURC_BEFORE,...,CPA_RVN_BEFORE,CPA_RVN_AFTER,ARPU_BEFORE,ARPU_AFTER,ACTVIITY_DAYS_AFTER,ACTIVITY_STATUS_AFTER,ID,PLAN_ID,PLAN_TYPE,VAS_ID
0,133,33,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/6/2022,Y,...,?,?,26.000000,17.000000,2,DURING & AFTER CAMP,0,1,Saver,1
1,37,41,Male,Malaysia,SARAWAK,Active,Active,Y,10/6/2022,Y,...,?,?,?,?,37,NO ACTIVITY SINCE SEP,1,1,Saver,3
2,44,21,Male,Malaysia,SARAWAK,Active,Active,N,?,N,...,?,?,?,?,38,NO ACTIVITY SINCE SEP,2,2,Premium,3
3,176,69,Female,Malaysia,PENANG,Active,Active,N,?,N,...,?,?,?,?,37,NO ACTIVITY SINCE SEP,3,2,Premium,3
4,171,79,Male,Malaysia,JOHORE,Active,Active,Y,10/6/2022,N,...,?,?,?,?,5,DURING & AFTER CAMP,4,2,Premium,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7267,28,27,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/6/2022,Y,...,?,?,?,?,0,DURING & AFTER CAMP,7267,2,Premium,3
7268,48,25,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/8/2022,Y,...,0.15,?,8.730000,3.120000,0,DURING & AFTER CAMP,7268,2,Premium,3
7269,71,65,Male,Malaysia,PAHANG,Active,Active,Y,10/6/2022,Y,...,?,?,29.980000,40.000000,0,DURING & AFTER CAMP,7269,1,Saver,2
7270,30,33,Male,Malaysia,JOHORE,Active,Active,N,?,N,...,?,?,17.000000,?,36,NO ACTIVITY SINCE SEP,7270,2,Premium,2


In [61]:
# Randomly choose some instances
df2 = df.sample(frac=0.3)

# Generate VAS ID
vas_id = range(4, 5 + 1)
vas_ids = random.choices(vas_id, k=len(df2))

# Replace vas_ids with the newly generated one
df2["VAS_ID"] = vas_ids
df2

Unnamed: 0,TENURE,AGE,GENDER,NATIONALITY,STATE,STATUS_BEFORE,STATUS_AFTER,OFFER_TAKER,OFFER_TAKE_UP_DT,DATA_PURC_BEFORE,...,CPA_RVN_BEFORE,CPA_RVN_AFTER,ARPU_BEFORE,ARPU_AFTER,ACTVIITY_DAYS_AFTER,ACTIVITY_STATUS_AFTER,ID,PLAN_ID,PLAN_TYPE,VAS_ID
7114,21,29,Male,Malaysia,JOHORE,Active,Active,Y,10/5/2022,Y,...,?,?,?,0.100000,2,BEFORE & AFTER CAMP,7114,1,Saver,4
494,30,48,Male,Malaysia,JOHORE,Active,Active,Y,10/6/2022,Y,...,?,?,?,?,0,DURING & AFTER CAMP,494,1,Saver,4
546,50,67,Female,Malaysia,JOHORE,Active,Active,Y,10/6/2022,Y,...,?,?,?,?,4,BEFORE & AFTER CAMP,546,2,Premium,4
3196,107,29,Male,Malaysia,PERAK,Active,Active,N,?,N,...,?,?,?,3.000000,1,DURING & AFTER CAMP,3196,2,Premium,4
4477,83,32,Female,Malaysia,KLANG VALLEY,Active,Active,N,?,N,...,1.00,?,?,?,37,NO ACTIVITY SINCE SEP,4477,2,Premium,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6396,24,24,Female,Malaysia,JOHORE,Active,Active,Y,10/5/2022,Y,...,?,?,20.700000,14.400000,1,DURING & AFTER CAMP,6396,2,Premium,4
1185,31,32,Male,Malaysia,KEDAH,Active,Active,Y,10/6/2022,Y,...,?,?,?,?,0,DURING & AFTER CAMP,1185,1,Saver,5
5437,36,35,Male,Malaysia,PENANG,Active,Active,N,?,Y,...,?,?,17.900000,1.400000,15,DURING & AFTER CAMP,5437,2,Premium,4
6653,37,62,Female,Malaysia,N SEMBILAN,Active,Active,Y,10/5/2022,Y,...,?,?,0.200000,?,34,NO ACTIVITY SINCE SEP,6653,2,Premium,4


In [62]:
# Randomly choose some instances
df3 = df.sample(frac=0.5)

# Generate VAS ID
vas_id = range(6, 7 + 1)
vas_ids = random.choices(vas_id, k=len(df3))

# Replace vas_ids with the newly generated one
df3["VAS_ID"] = vas_ids
df3

Unnamed: 0,TENURE,AGE,GENDER,NATIONALITY,STATE,STATUS_BEFORE,STATUS_AFTER,OFFER_TAKER,OFFER_TAKE_UP_DT,DATA_PURC_BEFORE,...,CPA_RVN_BEFORE,CPA_RVN_AFTER,ARPU_BEFORE,ARPU_AFTER,ACTVIITY_DAYS_AFTER,ACTIVITY_STATUS_AFTER,ID,PLAN_ID,PLAN_TYPE,VAS_ID
130,176,27,Female,Malaysia,PERAK,Active,Active,Y,10/5/2022,Y,...,?,?,?,10.300000,7,DURING & AFTER CAMP,130,1,Saver,6
5952,46,24,Female,Malaysia,KLANG VALLEY,Active,Active,Y,10/5/2022,Y,...,?,?,?,6.000000,16,BEFORE & AFTER CAMP,5952,1,Saver,7
247,15,24,Male,Malaysia,SABAH,Active,Active,Y,10/6/2022,Y,...,?,?,0.350000,0.100000,0,BEFORE & AFTER CAMP,247,2,Premium,7
347,91,54,Female,Malaysia,KELANTAN,Active,Active,Y,10/6/2022,Y,...,?,?,?,?,7,BEFORE & AFTER CAMP,347,1,Saver,6
204,84,62,Male,Malaysia,SARAWAK,Active,Active,Y,10/6/2022,Y,...,?,?,2.600000,?,34,NO ACTIVITY SINCE SEP,204,1,Saver,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5606,52,84,Male,Malaysia,JOHORE,Active,Active,Y,10/6/2022,Y,...,?,?,?,?,38,NO ACTIVITY SINCE SEP,5606,2,Premium,6
4612,168,65,Female,Malaysia,SARAWAK,Active,Active,Y,10/5/2022,N,...,?,?,8.000000,17.160000,6,BEFORE & AFTER CAMP,4612,2,Premium,7
4099,18,82,Male,Malaysia,PERAK,Active,Active,N,?,N,...,?,?,0.900000,?,3,DURING & AFTER CAMP,4099,2,Premium,6
3266,107,31,Male,Malaysia,SARAWAK,Active,Active,Y,10/6/2022,Y,...,?,?,?,8.000000,3,DURING & AFTER CAMP,3266,1,Saver,7


In [63]:
# Concat all df
df = pd.concat([df, df2, df3])
df

Unnamed: 0,TENURE,AGE,GENDER,NATIONALITY,STATE,STATUS_BEFORE,STATUS_AFTER,OFFER_TAKER,OFFER_TAKE_UP_DT,DATA_PURC_BEFORE,...,CPA_RVN_BEFORE,CPA_RVN_AFTER,ARPU_BEFORE,ARPU_AFTER,ACTVIITY_DAYS_AFTER,ACTIVITY_STATUS_AFTER,ID,PLAN_ID,PLAN_TYPE,VAS_ID
0,133,33,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/6/2022,Y,...,?,?,26.000000,17.000000,2,DURING & AFTER CAMP,0,1,Saver,1
1,37,41,Male,Malaysia,SARAWAK,Active,Active,Y,10/6/2022,Y,...,?,?,?,?,37,NO ACTIVITY SINCE SEP,1,1,Saver,3
2,44,21,Male,Malaysia,SARAWAK,Active,Active,N,?,N,...,?,?,?,?,38,NO ACTIVITY SINCE SEP,2,2,Premium,3
3,176,69,Female,Malaysia,PENANG,Active,Active,N,?,N,...,?,?,?,?,37,NO ACTIVITY SINCE SEP,3,2,Premium,3
4,171,79,Male,Malaysia,JOHORE,Active,Active,Y,10/6/2022,N,...,?,?,?,?,5,DURING & AFTER CAMP,4,2,Premium,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5606,52,84,Male,Malaysia,JOHORE,Active,Active,Y,10/6/2022,Y,...,?,?,?,?,38,NO ACTIVITY SINCE SEP,5606,2,Premium,6
4612,168,65,Female,Malaysia,SARAWAK,Active,Active,Y,10/5/2022,N,...,?,?,8.000000,17.160000,6,BEFORE & AFTER CAMP,4612,2,Premium,7
4099,18,82,Male,Malaysia,PERAK,Active,Active,N,?,N,...,?,?,0.900000,?,3,DURING & AFTER CAMP,4099,2,Premium,6
3266,107,31,Male,Malaysia,SARAWAK,Active,Active,Y,10/6/2022,Y,...,?,?,?,8.000000,3,DURING & AFTER CAMP,3266,1,Saver,7


In [64]:
# Lookup table from value-added-service ID to service name.
vas = {
    1: "Caller_tune",
    2: "Data_roaming",
    3: "Idd_call",
    4: "Games_credit",
    5: "Sport_stream",
    6: "Video_stream",
    7: "Music_stream",
}

# Translate every VAS_ID through the table; an ID absent from it yields
# dict.get's default of None.
df["VAS_TYPE"] = df["VAS_ID"].map(vas.get)
df

Unnamed: 0,TENURE,AGE,GENDER,NATIONALITY,STATE,STATUS_BEFORE,STATUS_AFTER,OFFER_TAKER,OFFER_TAKE_UP_DT,DATA_PURC_BEFORE,...,CPA_RVN_AFTER,ARPU_BEFORE,ARPU_AFTER,ACTVIITY_DAYS_AFTER,ACTIVITY_STATUS_AFTER,ID,PLAN_ID,PLAN_TYPE,VAS_ID,VAS_TYPE
0,133,33,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/6/2022,Y,...,?,26.000000,17.000000,2,DURING & AFTER CAMP,0,1,Saver,1,Caller_tune
1,37,41,Male,Malaysia,SARAWAK,Active,Active,Y,10/6/2022,Y,...,?,?,?,37,NO ACTIVITY SINCE SEP,1,1,Saver,3,Idd_call
2,44,21,Male,Malaysia,SARAWAK,Active,Active,N,?,N,...,?,?,?,38,NO ACTIVITY SINCE SEP,2,2,Premium,3,Idd_call
3,176,69,Female,Malaysia,PENANG,Active,Active,N,?,N,...,?,?,?,37,NO ACTIVITY SINCE SEP,3,2,Premium,3,Idd_call
4,171,79,Male,Malaysia,JOHORE,Active,Active,Y,10/6/2022,N,...,?,?,?,5,DURING & AFTER CAMP,4,2,Premium,3,Idd_call
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5606,52,84,Male,Malaysia,JOHORE,Active,Active,Y,10/6/2022,Y,...,?,?,?,38,NO ACTIVITY SINCE SEP,5606,2,Premium,6,Video_stream
4612,168,65,Female,Malaysia,SARAWAK,Active,Active,Y,10/5/2022,N,...,?,8.000000,17.160000,6,BEFORE & AFTER CAMP,4612,2,Premium,7,Music_stream
4099,18,82,Male,Malaysia,PERAK,Active,Active,N,?,N,...,?,0.900000,?,3,DURING & AFTER CAMP,4099,2,Premium,6,Video_stream
3266,107,31,Male,Malaysia,SARAWAK,Active,Active,Y,10/6/2022,Y,...,?,?,8.000000,3,DURING & AFTER CAMP,3266,1,Saver,7,Music_stream


`YEAR`, `MONTH` and `DAY` are parsed from `OFFER_TAKE_UP_DT` (formatted month/day/year), and `QUARTER` is derived from `MONTH`; rows without a take-up date keep `?` in all four. `OFFER_TAKE_UP_DT` itself is then renamed to `DATE_ID`.

In [65]:
# OFFER_TAKE_UP_DT is month/day/year text; "?" marks a missing date and is
# carried through unchanged. The extracted parts stay strings.
take_up = df["OFFER_TAKE_UP_DT"]
field_positions = {"YEAR": 2, "MONTH": 0, "DAY": 1}

for part_name, pos in field_positions.items():
    # Bind `pos` as a default so each lambda keeps its own field position.
    df[part_name] = take_up.apply(
        lambda raw, pos=pos: "?" if raw == "?" else raw.split("/")[pos]
    )
df

Unnamed: 0,TENURE,AGE,GENDER,NATIONALITY,STATE,STATUS_BEFORE,STATUS_AFTER,OFFER_TAKER,OFFER_TAKE_UP_DT,DATA_PURC_BEFORE,...,ACTVIITY_DAYS_AFTER,ACTIVITY_STATUS_AFTER,ID,PLAN_ID,PLAN_TYPE,VAS_ID,VAS_TYPE,YEAR,MONTH,DAY
0,133,33,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/6/2022,Y,...,2,DURING & AFTER CAMP,0,1,Saver,1,Caller_tune,2022,10,6
1,37,41,Male,Malaysia,SARAWAK,Active,Active,Y,10/6/2022,Y,...,37,NO ACTIVITY SINCE SEP,1,1,Saver,3,Idd_call,2022,10,6
2,44,21,Male,Malaysia,SARAWAK,Active,Active,N,?,N,...,38,NO ACTIVITY SINCE SEP,2,2,Premium,3,Idd_call,?,?,?
3,176,69,Female,Malaysia,PENANG,Active,Active,N,?,N,...,37,NO ACTIVITY SINCE SEP,3,2,Premium,3,Idd_call,?,?,?
4,171,79,Male,Malaysia,JOHORE,Active,Active,Y,10/6/2022,N,...,5,DURING & AFTER CAMP,4,2,Premium,3,Idd_call,2022,10,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5606,52,84,Male,Malaysia,JOHORE,Active,Active,Y,10/6/2022,Y,...,38,NO ACTIVITY SINCE SEP,5606,2,Premium,6,Video_stream,2022,10,6
4612,168,65,Female,Malaysia,SARAWAK,Active,Active,Y,10/5/2022,N,...,6,BEFORE & AFTER CAMP,4612,2,Premium,7,Music_stream,2022,10,5
4099,18,82,Male,Malaysia,PERAK,Active,Active,N,?,N,...,3,DURING & AFTER CAMP,4099,2,Premium,6,Video_stream,?,?,?
3266,107,31,Male,Malaysia,SARAWAK,Active,Active,Y,10/6/2022,Y,...,3,DURING & AFTER CAMP,3266,1,Saver,7,Music_stream,2022,10,6


In [66]:
def _quarter_of(month):
    """Map a month given as text ("1".."12") to its quarter 1-4; "?" is returned unchanged."""
    if month == "?":
        return "?"
    return (int(month) - 1) // 3 + 1


df["QUARTER"] = df["MONTH"].apply(_quarter_of)
df

Unnamed: 0,TENURE,AGE,GENDER,NATIONALITY,STATE,STATUS_BEFORE,STATUS_AFTER,OFFER_TAKER,OFFER_TAKE_UP_DT,DATA_PURC_BEFORE,...,ACTIVITY_STATUS_AFTER,ID,PLAN_ID,PLAN_TYPE,VAS_ID,VAS_TYPE,YEAR,MONTH,DAY,QUARTER
0,133,33,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/6/2022,Y,...,DURING & AFTER CAMP,0,1,Saver,1,Caller_tune,2022,10,6,4
1,37,41,Male,Malaysia,SARAWAK,Active,Active,Y,10/6/2022,Y,...,NO ACTIVITY SINCE SEP,1,1,Saver,3,Idd_call,2022,10,6,4
2,44,21,Male,Malaysia,SARAWAK,Active,Active,N,?,N,...,NO ACTIVITY SINCE SEP,2,2,Premium,3,Idd_call,?,?,?,?
3,176,69,Female,Malaysia,PENANG,Active,Active,N,?,N,...,NO ACTIVITY SINCE SEP,3,2,Premium,3,Idd_call,?,?,?,?
4,171,79,Male,Malaysia,JOHORE,Active,Active,Y,10/6/2022,N,...,DURING & AFTER CAMP,4,2,Premium,3,Idd_call,2022,10,6,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5606,52,84,Male,Malaysia,JOHORE,Active,Active,Y,10/6/2022,Y,...,NO ACTIVITY SINCE SEP,5606,2,Premium,6,Video_stream,2022,10,6,4
4612,168,65,Female,Malaysia,SARAWAK,Active,Active,Y,10/5/2022,N,...,BEFORE & AFTER CAMP,4612,2,Premium,7,Music_stream,2022,10,5,4
4099,18,82,Male,Malaysia,PERAK,Active,Active,N,?,N,...,DURING & AFTER CAMP,4099,2,Premium,6,Video_stream,?,?,?,?
3266,107,31,Male,Malaysia,SARAWAK,Active,Active,Y,10/6/2022,Y,...,DURING & AFTER CAMP,3266,1,Saver,7,Music_stream,2022,10,6,4


In [67]:
# Rename the take-up date column to DATE_ID, the "original date" attribute
# listed among the proposed new attributes.
df = df.rename(columns={"OFFER_TAKE_UP_DT": "DATE_ID"})

### Incorrect data

In [68]:
# The earlier concatenation left repeated index labels; rebuild a unique
# 0..n-1 index so the label-based .loc writes below each touch exactly one row.
df = df.reset_index(drop=True)

In [69]:
# Pick 2.5% of the row positions, without repeats, whose AGE will be corrupted.
n_bad_age = round(len(df) * .025)
age_index = random.sample(range(len(df)), n_bad_age)
len(age_index)

327

In [70]:
# Inject invalid ages: overwrite AGE with 0 on every selected row in a single
# label-based assignment (labels equal row positions after the index reset).
df.loc[age_index, "AGE"] = 0

df

Unnamed: 0,TENURE,AGE,GENDER,NATIONALITY,STATE,STATUS_BEFORE,STATUS_AFTER,OFFER_TAKER,DATE_ID,DATA_PURC_BEFORE,...,ACTIVITY_STATUS_AFTER,ID,PLAN_ID,PLAN_TYPE,VAS_ID,VAS_TYPE,YEAR,MONTH,DAY,QUARTER
0,133,33,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/6/2022,Y,...,DURING & AFTER CAMP,0,1,Saver,1,Caller_tune,2022,10,6,4
1,37,41,Male,Malaysia,SARAWAK,Active,Active,Y,10/6/2022,Y,...,NO ACTIVITY SINCE SEP,1,1,Saver,3,Idd_call,2022,10,6,4
2,44,21,Male,Malaysia,SARAWAK,Active,Active,N,?,N,...,NO ACTIVITY SINCE SEP,2,2,Premium,3,Idd_call,?,?,?,?
3,176,69,Female,Malaysia,PENANG,Active,Active,N,?,N,...,NO ACTIVITY SINCE SEP,3,2,Premium,3,Idd_call,?,?,?,?
4,171,79,Male,Malaysia,JOHORE,Active,Active,Y,10/6/2022,N,...,DURING & AFTER CAMP,4,2,Premium,3,Idd_call,2022,10,6,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13085,52,0,Male,Malaysia,JOHORE,Active,Active,Y,10/6/2022,Y,...,NO ACTIVITY SINCE SEP,5606,2,Premium,6,Video_stream,2022,10,6,4
13086,168,65,Female,Malaysia,SARAWAK,Active,Active,Y,10/5/2022,N,...,BEFORE & AFTER CAMP,4612,2,Premium,7,Music_stream,2022,10,5,4
13087,18,82,Male,Malaysia,PERAK,Active,Active,N,?,N,...,DURING & AFTER CAMP,4099,2,Premium,6,Video_stream,?,?,?,?
13088,107,31,Male,Malaysia,SARAWAK,Active,Active,Y,10/6/2022,Y,...,DURING & AFTER CAMP,3266,1,Saver,7,Music_stream,2022,10,6,4


In [71]:
# Frequency of each AGE value, ordered by age, to confirm the injected zeros.
# NOTE(review): the -9999 entries in the output are not written by any cell in
# this notebook — presumably a sentinel already present in the raw file; confirm.
df["AGE"].value_counts().sort_index()

-9999     22
 0       327
 19        2
 20        5
 21       64
        ... 
 90       13
 95        1
 98        2
 99        1
 100       2
Name: AGE, Length: 78, dtype: int64

In [72]:
# Pick 2.5% of the row positions, without repeats, whose TENURE will be corrupted.
n_bad_tenure = round(len(df) * .025)
tenure_index = random.sample(range(len(df)), n_bad_tenure)
len(tenure_index)

327

In [73]:
# Inject implausible tenures: overwrite TENURE with 520 on every selected row
# in a single label-based assignment (labels equal row positions after the
# index reset).
df.loc[tenure_index, "TENURE"] = 520

df

Unnamed: 0,TENURE,AGE,GENDER,NATIONALITY,STATE,STATUS_BEFORE,STATUS_AFTER,OFFER_TAKER,DATE_ID,DATA_PURC_BEFORE,...,ACTIVITY_STATUS_AFTER,ID,PLAN_ID,PLAN_TYPE,VAS_ID,VAS_TYPE,YEAR,MONTH,DAY,QUARTER
0,133,33,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/6/2022,Y,...,DURING & AFTER CAMP,0,1,Saver,1,Caller_tune,2022,10,6,4
1,37,41,Male,Malaysia,SARAWAK,Active,Active,Y,10/6/2022,Y,...,NO ACTIVITY SINCE SEP,1,1,Saver,3,Idd_call,2022,10,6,4
2,44,21,Male,Malaysia,SARAWAK,Active,Active,N,?,N,...,NO ACTIVITY SINCE SEP,2,2,Premium,3,Idd_call,?,?,?,?
3,176,69,Female,Malaysia,PENANG,Active,Active,N,?,N,...,NO ACTIVITY SINCE SEP,3,2,Premium,3,Idd_call,?,?,?,?
4,171,79,Male,Malaysia,JOHORE,Active,Active,Y,10/6/2022,N,...,DURING & AFTER CAMP,4,2,Premium,3,Idd_call,2022,10,6,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13085,52,0,Male,Malaysia,JOHORE,Active,Active,Y,10/6/2022,Y,...,NO ACTIVITY SINCE SEP,5606,2,Premium,6,Video_stream,2022,10,6,4
13086,168,65,Female,Malaysia,SARAWAK,Active,Active,Y,10/5/2022,N,...,BEFORE & AFTER CAMP,4612,2,Premium,7,Music_stream,2022,10,5,4
13087,18,82,Male,Malaysia,PERAK,Active,Active,N,?,N,...,DURING & AFTER CAMP,4099,2,Premium,6,Video_stream,?,?,?,?
13088,107,31,Male,Malaysia,SARAWAK,Active,Active,Y,10/6/2022,Y,...,DURING & AFTER CAMP,3266,1,Saver,7,Music_stream,2022,10,6,4


In [74]:
# Frequency of each TENURE value, ordered by tenure, to confirm the injected 520s.
df["TENURE"].value_counts().sort_index()

3        2
4        2
5        2
6        2
8        3
      ... 
293      3
294      3
295      2
296      3
520    327
Name: TENURE, Length: 269, dtype: int64

In [75]:
# Add a BIRTHDAY column holding the same constant string on every row.
# NOTE(review): this sits under "Incorrect data", so a single constant that
# cannot agree with the varying AGE values is presumably deliberate dirty
# data — confirm.
df["BIRTHDAY"] = "1/1/2000"
df

Unnamed: 0,TENURE,AGE,GENDER,NATIONALITY,STATE,STATUS_BEFORE,STATUS_AFTER,OFFER_TAKER,DATE_ID,DATA_PURC_BEFORE,...,ID,PLAN_ID,PLAN_TYPE,VAS_ID,VAS_TYPE,YEAR,MONTH,DAY,QUARTER,BIRTHDAY
0,133,33,Male,Malaysia,KLANG VALLEY,Active,Active,Y,10/6/2022,Y,...,0,1,Saver,1,Caller_tune,2022,10,6,4,1/1/2000
1,37,41,Male,Malaysia,SARAWAK,Active,Active,Y,10/6/2022,Y,...,1,1,Saver,3,Idd_call,2022,10,6,4,1/1/2000
2,44,21,Male,Malaysia,SARAWAK,Active,Active,N,?,N,...,2,2,Premium,3,Idd_call,?,?,?,?,1/1/2000
3,176,69,Female,Malaysia,PENANG,Active,Active,N,?,N,...,3,2,Premium,3,Idd_call,?,?,?,?,1/1/2000
4,171,79,Male,Malaysia,JOHORE,Active,Active,Y,10/6/2022,N,...,4,2,Premium,3,Idd_call,2022,10,6,4,1/1/2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13085,52,0,Male,Malaysia,JOHORE,Active,Active,Y,10/6/2022,Y,...,5606,2,Premium,6,Video_stream,2022,10,6,4,1/1/2000
13086,168,65,Female,Malaysia,SARAWAK,Active,Active,Y,10/5/2022,N,...,4612,2,Premium,7,Music_stream,2022,10,5,4,1/1/2000
13087,18,82,Male,Malaysia,PERAK,Active,Active,N,?,N,...,4099,2,Premium,6,Video_stream,?,?,?,?,1/1/2000
13088,107,31,Male,Malaysia,SARAWAK,Active,Active,Y,10/6/2022,Y,...,3266,1,Saver,7,Music_stream,2022,10,6,4,1/1/2000


In [76]:
# Write the augmented frame to disk without the positional index column.
# NOTE(review): requirement 3 asks for a random 10% sample of the original
# dataset, but no such sampling is visible before this export — confirm
# whether it happens elsewhere.
output_csv = "final_dataset.csv"
df.to_csv(output_csv, index=False)