In [16]:
import json

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Load the demographics table (tab-separated text) and preview it.
demographic_df = pd.read_csv("/Users/akhilnair/Downloads/demographics.txt", delimiter='\t')
print(demographic_df.head())

# Load the two JSON sources; each file is parsed into a list of dict items.
with open("/Users/akhilnair/Downloads/behaviour.json") as behaviour_file:
    behaviour_data = json.load(behaviour_file)
with open("/Users/akhilnair/Downloads/campaign.json") as campaign_file:
    campaign_data = json.load(campaign_file)


def _split_records(records):
    """Split JSON items into parallel lists of IDs and row dicts.

    Only the first key/value pair of each item is used: the key, with the
    'ID_' text removed, becomes the ID and the value becomes the row.

    Parameters
    ----------
    records : iterable of dict
        Items as loaded from the JSON files above.

    Returns
    -------
    tuple[list, list]
        ``(ids, rows)`` in the same order as ``records``.
    """
    ids, rows = [], []
    for record in records:
        key, row = next(iter(record.items()))
        ids.append(key.replace('ID_', ''))
        rows.append(row)
    return ids, rows


# Converting the JSON data to DataFrames
behaviour_ids, behaviour_list = _split_records(behaviour_data)
behaviour_df = pd.DataFrame(behaviour_list)
behaviour_df['ID'] = behaviour_ids
print(behaviour_df.head())

campaign_ids, campaign_list = _split_records(campaign_data)
campaign_df = pd.DataFrame(campaign_list)
campaign_df['ID'] = campaign_ids
print(campaign_df.head())

# Converting ID columns to string type for consistent joining
demographic_df['ID'] = demographic_df['ID'].astype(str)
behaviour_df['ID'] = behaviour_df['ID'].astype(str)
campaign_df['ID'] = campaign_df['ID'].astype(str)

# Merging the datasets on ID (default inner join: only IDs present in all
# three sources are kept)
consolidated_df = demographic_df.merge(behaviour_df, on='ID').merge(campaign_df, on='ID')

# Removing extra spaces in column names
consolidated_df.columns = consolidated_df.columns.str.strip()

# Clean and convert the Income column: drop '$' and ',' then cast to float.
# The pattern is a raw string because '\$' in a plain string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
consolidated_df['Income'] = consolidated_df['Income'].replace(r'[\$,]', '', regex=True).astype(float)

      ID  Year_Birth   Education Marital_Status      Income   Kidhome  \
0   1826        1970  Graduation       Divorced  $84,835.00         0   
1      1        1961  Graduation         Single  $57,091.00         0   
2  10476        1958  Graduation        Married  $67,267.00         0   
3   1386        1967  Graduation       Together  $32,474.00         1   
4   5371        1989  Graduation         Single  $21,474.00         1   

   Teenhome Dt_Customer Country  
0         0     6/16/14      SP  
1         0     6/15/14      CA  
2         1     5/13/14      US  
3         1     5/11/14     AUS  
4         0      4/8/14      SP  
   Recency  MntWines  MntFruits  MntMeatProducts  MntFishProducts  \
0        0       189        104              379              111   
1        0       464          5               64                7   
2        0       134         11               59               15   
3        0        10          0                1                0   
4        0  

In [18]:
# Trim leading/trailing whitespace from every column label, then show the
# frame as the cell's output.
consolidated_df.columns = consolidated_df.columns.map(str.strip)
consolidated_df

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Country,Recency,...,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp1,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,Response,Complain
0,1826,1970,Graduation,Divorced,84835.0,0,0,6/16/14,SP,0,...,4,6,1,0,0,0,0,0,1,0
1,1,1961,Graduation,Single,57091.0,0,0,6/15/14,CA,0,...,3,7,5,0,1,0,0,0,1,0
2,10476,1958,Graduation,Married,67267.0,0,1,5/13/14,US,0,...,2,5,2,0,0,0,0,0,0,0
3,1386,1967,Graduation,Together,32474.0,1,1,5/11/14,AUS,0,...,0,2,7,0,0,0,0,0,0,0
4,5371,1989,Graduation,Single,21474.0,1,0,4/8/14,SP,0,...,1,2,7,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10142,1976,PhD,Divorced,66476.0,0,1,3/7/13,US,99,...,2,11,4,0,0,0,0,0,0,0
2236,5263,1977,2n Cycle,Married,31056.0,1,0,1/22/13,SP,99,...,0,3,8,0,0,0,0,0,0,0
2237,22,1976,Graduation,Divorced,46310.0,1,0,12/3/12,SP,99,...,1,5,8,0,0,0,0,0,0,0
2238,528,1978,Graduation,Married,65819.0,0,0,11/29/12,IND,99,...,4,10,3,0,0,0,0,0,0,0


In [19]:
# Clean and convert the Income column: remove '$' and ',' characters, then
# cast to float. The pattern is a raw string because '\$' inside a plain
# string literal is an invalid escape sequence and triggers a warning on
# recent Python versions.
consolidated_df['Income'] = consolidated_df['Income'].replace(r'[\$,]', '', regex=True).astype(float)

In [5]:
# Defining feature columns and target variable
features = ['Year_Birth', 'Income', 'Kidhome', 'Teenhome', 'Recency',
            'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
            'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases',
            'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
            'NumWebVisitsMonth']
target = 'Response'

# Converting categorical columns to dummy variables.
# The frame is overwritten in place, so on a second run the original
# categorical columns no longer exist; only encode the ones still present so
# re-running this cell does not raise KeyError.
# NOTE(review): none of the generated dummy columns appear in `features`, so
# anything that selects `consolidated_df[features]` ignores them - confirm
# whether that is intended.
categorical_cols = [col for col in ('Education', 'Marital_Status', 'Country')
                    if col in consolidated_df.columns]
if categorical_cols:
    consolidated_df = pd.get_dummies(consolidated_df, columns=categorical_cols, drop_first=True)

In [6]:
# Handling missing values using SimpleImputer: each missing entry in a feature
# column is replaced by that column's mean.
# `SimpleImputer` lives in `sklearn.impute` and has to be imported in the
# notebook's import cell; without that import this cell raises NameError on a
# fresh kernel.
# NOTE(review): the imputer is fitted on all rows before the train/test split
# performed in a later cell, so the column means also reflect rows that end up
# in the test set - consider fitting on the training rows only.
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(consolidated_df[features])  # column order follows `features`
y = consolidated_df[target]

In [7]:
# Splitting data into training and testing sets: 30% of the rows are held out,
# and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Training the Logistic Regression model (fit returns the estimator itself)
# and predicting labels for the held-out rows.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)

In [12]:
# Evaluating the Logistic Regression model: build the per-class
# precision/recall/F1 text report, then print it under a heading.
log_reg_report = classification_report(y_test, y_pred_log_reg)
print("Logistic Regression Model", log_reg_report, sep="\n")

Logistic Regression Model
              precision    recall  f1-score   support

           0       0.87      0.97      0.92       567
           1       0.55      0.20      0.29       105

    accuracy                           0.85       672
   macro avg       0.71      0.59      0.60       672
weighted avg       0.82      0.85      0.82       672



In [13]:
# Confusion matrix for the Logistic Regression predictions
# (rows: true labels, columns: predicted labels).
log_reg_confusion_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_log_reg)
print(log_reg_confusion_matrix)

[[550  17]
 [ 84  21]]


In [20]:
# Training the Random Forest model: 100 trees with a fixed seed so the fitted
# forest is repeatable; fit returns the estimator itself.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_rf = rf_clf.predict(X_test)

In [21]:
# Evaluating the Random Forest model: compute both metrics first, then print
# the heading, the classification report and the confusion matrix in order.
rf_report = classification_report(y_test, y_pred_rf)
rf_confusion_matrix = confusion_matrix(y_test, y_pred_rf)
print("Random Forest Model", rf_report, rf_confusion_matrix, sep="\n")

Random Forest Model
              precision    recall  f1-score   support

           0       0.88      0.97      0.92       567
           1       0.63      0.30      0.40       105

    accuracy                           0.86       672
   macro avg       0.76      0.63      0.66       672
weighted avg       0.84      0.86      0.84       672

[[549  18]
 [ 74  31]]
