# Load Data

In [1]:
import pandas as pd 
# Root folder of the dataset; the cells below build their paths from it.
path = 'Data'

# Read the feeds and ads training tables from the train/ subfolder.
df_feeds, df_ads = [
    pd.read_csv(f'{path}/train/{csv_name}')
    for csv_name in ('train_data_feeds.csv', 'train_data_ads.csv')
]
print("data is loaded...")

data is loaded...


# 1. Find Potential Customers and Non-potential Customers

- Users in the ads dataset are potential customers.
- Users in the feeds dataset and not in the ads dataset are non-potential customers.

In [2]:
# Distinct user ids that appear in the ads dataset.
user_ads = df_ads['user_id'].unique()

# Row mask over the feeds table: True where the row's user is also in the ads data.
is_potential = df_feeds['u_userId'].isin(user_ads)

# Derive both groups from the same mask instead of dropping rows by index label.
# A label-based drop removes every row sharing a label, so it would silently
# discard extra rows if the index ever contained duplicates.
df_pot = df_feeds[is_potential]
df_nonpot = df_feeds[~is_potential]

# 2. Non-potential Customers Data Preprocess

- Drop columns with more than 1000 unique values, columns with a single unique value (constant), and non-numerical columns.
- Labels (`label` and `cillabel`): convert -1 to 0.

In [3]:
# Report (rows, columns) for the two groups and for the full feeds table.
for frame in (df_nonpot, df_pot, df_feeds):
    print(frame.shape)

(751889, 28)
(2475843, 28)
(3227732, 28)


In [4]:
# Show the frame's dimensions, then the number of distinct values in each column.
print(df_nonpot.shape, df_nonpot.nunique(), sep='\n')

(751889, 28)
u_userId                114826
u_phonePrice                 7
u_browserLifeCycle           8
u_browserMode                8
u_feedLifeCycle              8
u_refreshTimes              10
u_newsCatInterests       73674
u_newsCatDislike           231
u_newsCatInterestsST    107272
u_click_ca2_news        151435
i_docId                  51622
i_s_sourceId              2552
i_regionEntity             364
i_cat                      207
i_entities               49133
i_dislikeTimes              10
i_upTimes                   10
i_dtype                      5
e_ch                        19
e_m                        262
e_po                        27
e_pl                      3089
e_rn                        99
e_section                    2
e_et                      3561
label                        2
cillabel                     2
pro                         35
dtype: int64


In [5]:
import numpy as np

# Columns with more distinct values than this are dropped.
MAX_UNIQUE_VALUES = 1000

# Count distinct values once per column and reuse the counts for both
# conditions below; each count is a full pass over the column.
unique_counts = df_nonpot.nunique()

# A column is dropped when it has more than MAX_UNIQUE_VALUES distinct values
# (e.g. u_userId) or exactly one distinct value (a constant column).
too_many_values = unique_counts > MAX_UNIQUE_VALUES
single_value = unique_counts == 1
columns_to_drop = unique_counts[too_many_values | single_value].index.tolist()

df_nonpot = df_nonpot.drop(columns=columns_to_drop)

# Keep only the columns with a numeric dtype.
df_nonpot = df_nonpot.select_dtypes(include=[np.number])

# Summarise what is left: dimensions, dtypes and per-column cardinality.
print(df_nonpot.shape)
print(df_nonpot.dtypes)
print(df_nonpot.nunique())

(751889, 18)
u_phonePrice          int64
u_browserLifeCycle    int64
u_browserMode         int64
u_feedLifeCycle       int64
u_refreshTimes        int64
i_regionEntity        int64
i_cat                 int64
i_dislikeTimes        int64
i_upTimes             int64
i_dtype               int64
e_ch                  int64
e_m                   int64
e_po                  int64
e_rn                  int64
e_section             int64
label                 int64
cillabel              int64
pro                   int64
dtype: object
u_phonePrice            7
u_browserLifeCycle      8
u_browserMode           8
u_feedLifeCycle         8
u_refreshTimes         10
i_regionEntity        364
i_cat                 207
i_dislikeTimes         10
i_upTimes              10
i_dtype                 5
e_ch                   19
e_m                   262
e_po                   27
e_rn                   99
e_section               2
label                   2
cillabel                2
pro                    35
dtype: int64

In [6]:
# Recode both target columns: -1 becomes 0, 1 stays 1.
label_recoding = {-1: 0, 1: 1}
for target in ('label', 'cillabel'):
    df_nonpot[target] = df_nonpot[target].replace(label_recoding)

# 3. Get Training, Holdout and Validate dataset

- The non-potential customers dataset has about 750,000 rows (751,889), so we draw a smaller sample of about 50,000 rows in total, i.e. training + holdout + validate ≈ 50000.

In [12]:
from Dataset_Utility.utility_functions import calculate_label_rate2, get_train_holdout_validate
# Label statistics of the full non-potential dataset, for comparison with the splits.
calculate_label_rate2(df_nonpot, 'label')

n = 50000 #sample size we want
df_label_train, df_label_holdout, df_label_val = get_train_holdout_validate(
    df_nonpot, 'label', n
)

# Same statistics for each split, in train / holdout / validate order.
for split in (df_label_train, df_label_holdout, df_label_val):
    calculate_label_rate2(split, 'label')

Total Sample size is 751889, Positive Sample size is 52591, Negative Sample size is 699298, label rate is 0.0752
Total Sample size is 19999, Positive Sample size is 1398, Negative Sample size is 18601, label rate is 0.0752
Total Sample size is 19999, Positive Sample size is 1398, Negative Sample size is 18601, label rate is 0.0752
Total Sample size is 9999, Positive Sample size is 699, Negative Sample size is 9300, label rate is 0.0752


In [13]:
import os
# Output folder for the 'label' splits.
task_path = f'{path}/nonpotential_label'

# exist_ok=True makes this a no-op when the folder already exists, so no
# separate existence check (which could race with another process) is needed.
os.makedirs(task_path, exist_ok=True)

# Write each split under task_path; index=False leaves the row index out of the CSV.
df_label_train.to_csv(f'{task_path}/df_train.csv', index=False)
df_label_holdout.to_csv(f'{task_path}/df_holdout.csv', index=False)
df_label_val.to_csv(f'{task_path}/df_val.csv', index=False)

In [14]:
# 'cillabel' statistics of the full non-potential dataset, for comparison with the splits.
calculate_label_rate2(df_nonpot, 'cillabel')

n = 50000 #sample size we want
df_cillabel_train, df_cillabel_holdout, df_cillabel_val = get_train_holdout_validate(
    df_nonpot, 'cillabel', n
)

# Same statistics for each split, in train / holdout / validate order.
for split in (df_cillabel_train, df_cillabel_holdout, df_cillabel_val):
    calculate_label_rate2(split, 'cillabel')

Total Sample size is 751889, Positive Sample size is 170, Negative Sample size is 751719, label rate is 0.0002
Total Sample size is 19999, Positive Sample size is 4, Negative Sample size is 19995, label rate is 0.0002
Total Sample size is 19999, Positive Sample size is 4, Negative Sample size is 19995, label rate is 0.0002
Total Sample size is 9999, Positive Sample size is 2, Negative Sample size is 9997, label rate is 0.0002


In [15]:
import os
# Output folder for the 'cillabel' splits.
task_path = f'{path}/nonpotential_cillabel'

# exist_ok=True makes this a no-op when the folder already exists, so no
# separate existence check (which could race with another process) is needed.
os.makedirs(task_path, exist_ok=True)

# Write each split under task_path; index=False leaves the row index out of the CSV.
df_cillabel_train.to_csv(f'{task_path}/df_train.csv', index=False)
df_cillabel_holdout.to_csv(f'{task_path}/df_holdout.csv', index=False)
df_cillabel_val.to_csv(f'{task_path}/df_val.csv', index=False)