In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
import warnings

# Read the two tab-separated input tables
participants_df = pd.read_csv('participants.tsv', sep='\t')
groundtruth_df = pd.read_csv('groundtruth.tsv', sep='\t')

# Show the column layout of each table
print("Participants columns:", list(participants_df.columns))
print("Groundtruth columns:", list(groundtruth_df.columns))

# Sanity checks: user_id must identify exactly one row per table,
# and both tables must describe the same set of users
participant_ids = participants_df['user_id']
groundtruth_ids = groundtruth_df['user_id']
assert participant_ids.nunique() == len(participants_df), "Duplicate user_ids in participants"
assert groundtruth_ids.nunique() == len(groundtruth_df), "Duplicate user_ids in groundtruth"
assert set(participant_ids) == set(groundtruth_ids), "user_id sets don't match"


Participants columns: ['user_id', 'country', 'education', 'age', 'income', 'gender', 'ad_position', 'ad_type', 'ad_category', 'serp_id', 'query', 'log_id']
Groundtruth columns: ['user_id', 'ad_clicked', 'attention', 'log_id']


In [8]:
# Display the first few rows of the participants dataframe to understand its structure
# (the groundtruth dataframe is previewed separately in the next cell)
participants_df.head()

Unnamed: 0,user_id,country,education,age,income,gender,ad_position,ad_type,ad_category,serp_id,query,log_id
0,5npsk114ba8hfbj4jr3lt8jhf5,PHL,3,3,1,male,top-left,dd,Computers & Electronics,tablets,tablets,20181002033126
1,5o9js8slc8rg2a8mo5p3r93qm0,VEN,3,1,1,male,top-right,dd,Shop - Luxury Goods,casio-watches,casio watches,20181001211223
2,pi17qjfqmnhpsiahbumcsdq0r6,VEN,2,3,1,male,top-left,native,Shop - Luxury Goods,chivas-regal,chivas regal,20181001170952
3,3rptg9g7l83imkbdsu2miignv7,VEN,3,2,1,male,top-right,dd,Shop - Luxury Goods,chivas-regal,chivas regal,20181001140754
4,049onniafv6fe4e6q42k6nq1n2,VEN,3,5,1,male,top-left,native,Autos & Vehicles,audi-r8-used,audi r8 used,20181001132434


In [9]:
# Display the first few rows of the groundtruth dataframe (user_id, ad_clicked, attention, log_id)
groundtruth_df.head()

Unnamed: 0,user_id,ad_clicked,attention,log_id
0,5npsk114ba8hfbj4jr3lt8jhf5,0,4,20181002033126
1,5o9js8slc8rg2a8mo5p3r93qm0,1,5,20181001211223
2,pi17qjfqmnhpsiahbumcsdq0r6,0,4,20181001170952
3,3rptg9g7l83imkbdsu2miignv7,0,1,20181001140754
4,049onniafv6fe4e6q42k6nq1n2,0,1,20181001132434


In [10]:
# NOTE(review): this silences *every* warning for the rest of the kernel session,
# including pandas deprecation/chained-assignment warnings. Kept so later cells
# behave as before; consider narrowing it to specific warning categories.
warnings.filterwarnings("ignore")

# Step 1: Record whether user_id values are unique in each dataset
participants_unique = participants_df['user_id'].is_unique
groundtruth_unique = groundtruth_df['user_id'].is_unique

# Step 2: Record whether both datasets cover the same set of user_ids
matching_user_ids = set(participants_df['user_id']) == set(groundtruth_df['user_id'])

# Step 3: Merge 'ad_clicked' and 'attention' columns from groundtruth_df into participants_df.
# validate='one_to_one' makes pandas raise a MergeError if user_id is duplicated on
# either side, instead of silently multiplying rows.
merged_df = pd.merge(
    participants_df,
    groundtruth_df[['user_id', 'ad_clicked', 'attention']],
    on='user_id',
    how='inner',
    validate='one_to_one',
)

# Step 4: Ensure numeric columns (education, age, income) are of numeric datatype,
# then replace any resulting nulls with the column median.
# The result is assigned back to the column rather than calling
# `merged_df[col].fillna(..., inplace=True)`: that chained in-place form operates on a
# temporary under pandas Copy-on-Write and would leave merged_df unchanged.
numeric_columns = ['education', 'age', 'income']
for numeric_col in numeric_columns:
    merged_df[numeric_col] = pd.to_numeric(merged_df[numeric_col], errors='coerce')
    merged_df[numeric_col] = merged_df[numeric_col].fillna(merged_df[numeric_col].median())

# Verify if there are any remaining null values
print(merged_df.isna().sum())

# Step 5: Update 'country' column to "non-USA" for non-"USA" values
# (vectorised: keep the value where it equals 'USA', otherwise substitute 'non-USA';
# missing values compare unequal to 'USA' and therefore also become 'non-USA')
merged_df['country'] = merged_df['country'].where(merged_df['country'] == 'USA', 'non-USA')

# Step 6: One-hot encode 'country', 'gender', 'ad_position', 'ad_type', 'ad_category'
# drop_first=True drops the first level of each variable to avoid redundant columns
categorical_columns = ['country', 'gender', 'ad_position', 'ad_type', 'ad_category']
encoded_df = pd.get_dummies(merged_df, columns=categorical_columns, drop_first=True)

print(encoded_df.head().to_string())

# Check the shape of the resulting dataframe and unique user_id consistency
print((encoded_df.shape, participants_unique, groundtruth_unique, matching_user_ids))


user_id        0
country        0
education      0
age            0
income         0
gender         0
ad_position    0
ad_type        0
ad_category    0
serp_id        0
query          0
log_id         0
ad_clicked     0
attention      0
dtype: int64
                      user_id  education  age  income        serp_id          query          log_id  ad_clicked  attention  country_non-USA  gender_male  gender_na  ad_position_top-right  ad_type_native  ad_category_Computers & Electronics  ad_category_Food & Drink  ad_category_Games  ad_category_Real Estate  ad_category_Shop - Apparel  ad_category_Shop - Event Ticket Sales  ad_category_Shop - Gifts & Special Event  ad_category_Shop - Luxury Goods  ad_category_Shop - Photo & Video Services  ad_category_Shop - Sporting Goods  ad_category_Shop - Toys  ad_category_Shop - Wholesalers & Liquidatr  ad_category_Travel
0  5npsk114ba8hfbj4jr3lt8jhf5        3.0  3.0     1.0        tablets        tablets  20181002033126           0          4        

In [11]:
# Step 1: Assemble the feature matrix (X) and the target vector (y)
# Features = three numeric columns plus every one-hot column produced from the
# categorical variables (matched by the marker that get_dummies puts in the name)
base_features = ['education', 'age', 'attention']
dummy_markers = ('country_', 'gender_', 'ad_position_', 'ad_type_', 'ad_category_')
dummy_features = [
    name for name in encoded_df.columns
    if any(marker in name for marker in dummy_markers)
]
X = encoded_df[base_features + dummy_features]

# 'income' is what the models will try to predict
y = encoded_df['income']

# Count any missing values left in the features / target
missing_X = X.isnull().sum().sum()
missing_y = y.isnull().sum()

print(X.shape, y.shape, missing_X, missing_y)

missing_X, missing_y


(2909, 21) (2909,) 0 0


(0, 0)

In [12]:

# Step 3: Hold out 20% of the rows for testing; the remaining 80% are used for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def _fit_and_score(model):
    """Fit `model` on the training split and evaluate it on the test split.

    Reads X_train, y_train, X_test and y_test from the notebook namespace.

    Returns:
        A tuple (predictions, mae, r2) where `predictions` are the model's
        outputs for X_test and `mae` / `r2` are mean_absolute_error and
        r2_score of those predictions against y_test.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions, mean_absolute_error(y_test, predictions), r2_score(y_test, predictions)


# Model 1: Linear Regression
linear_reg = LinearRegression()
y_pred_linear, mae_linear, r2_linear = _fit_and_score(linear_reg)
print("Linear Regression:", (mae_linear, r2_linear))

# Model 2: Decision Tree Regression
tree_reg = DecisionTreeRegressor(random_state=42)
y_pred_tree, mae_tree, r2_tree = _fit_and_score(tree_reg)
print("Decision Tree Regression:", (mae_tree, r2_tree))

# Model 3: Random Forest Regression
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
y_pred_rf, mae_rf, r2_rf = _fit_and_score(rf_reg)
print("Random Forest Regression:", (mae_rf, r2_rf))

# Side-by-side summary of the three models, rounded to two decimals
print("\nModel Comparison:")
print("--------------------")
print(f"Linear Regression:   MAE={mae_linear:.2f}, R²={r2_linear:.2f}")
print(f"Decision Tree Reg.:  MAE={mae_tree:.2f}, R²={r2_tree:.2f}")
print(f"Random Forest Reg.:  MAE={mae_rf:.2f}, R²={r2_rf:.2f}")

Linear Regression: (1.265958188632109, 0.12852141755255297)
Decision Tree Regression: (1.431631484208804, -0.49043747768261325)
Random Forest Regression: (1.2662664422728855, 0.030906642744190194)

Model Comparison:
--------------------
Linear Regression:   MAE=1.27, R²=0.13
Decision Tree Reg.:  MAE=1.43, R²=-0.49
Random Forest Reg.:  MAE=1.27, R²=0.03


Evaluating and Comparing Models:

Based on the provided results, here's an evaluation and comparison of the three regression models, followed by conclusions about the relationship between the input and target features:

Model Performance Metrics:

*   Mean Absolute Error (MAE): Measures the average magnitude of errors (in the same units as the target variable). Lower values are better.
*   R-squared (R²): Assesses the model's goodness of fit, representing the proportion of the variance in the target variable that's predictable from the input features. Higher values (up to 1) are generally better. Negative R² values indicate that the model performs worse than a simple mean predictor.

Model Comparison:

| Model | MAE | R² | Interpretation |
| --- | --- | --- | --- |
| Linear Regression | 1.27 | 0.13 | Best of the three models, but still weak: it explains only about 13% of the target variance. |
| Decision Tree Regression | 1.43 | -0.49 | Poor performance; the negative R² means the model predicts worse than simply using the mean of the target, and it also has the highest MAE. |
| Random Forest Regression | 1.27 | 0.03 | Similar MAE to Linear Regression but with a significantly lower R², indicating it barely outperforms a mean predictor. |

Conclusions and Insights:

1.  Relationship Between Input and Target Features:
    *   Weak Relationship: The best-performing models (Linear Regression and Random Forest Regression) have relatively low R² values (0.13 and 0.03, respectively). This suggests a weak relationship between the input features and the target variable, making predictions challenging.
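    *   Limited Predictive Power: The input features collectively capture only a small portion of the variance in the target variable, indicating that either important features are missing, or the relationship is highly complex/non-linear. Since the non-linear models (Decision Tree and Random Forest) did not outperform Linear Regression here, missing informative features are the more likely explanation.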
2.  Model Suitability:
    *   Linear Regression is the best of the three models here (highest R² and marginally lowest MAE) and is a sensible baseline because it is simple and straightforward to interpret. Its performance is still weak (R² = 0.13), so it might not fully leverage the data's potential.
    *   Decision Tree Regression is not suitable for this dataset due to its poor performance.
    *   Random Forest Regression, while having a similar MAE to Linear Regression, has a lower R² (0.03 vs. 0.13), so it offers no improvement. It might be beneficial for more complex datasets but doesn't offer advantages here.
