# Encoding, Seperate Features, Split Data, Scale Features, Train and Evaluate ML Model, Feature Importance, Actionable Insgihts, Segmentation

In [1]:
import pandas as pd

In [2]:
# Import cleaned data

c_data = pd.read_csv('C:\\cleaned_data.csv')
c_data.head()

Unnamed: 0,user_id,session_id,date,country,gender,category,has_account,has_price_alert,platform,average_leadout_price,leadouts
0,1591926,5169471070,2022-09-01,AT,m,Electronics,0.0,0.0,web,257.79,7.0
1,1892859,5350443957,2022-09-01,AT,f,Electronics,0.0,0.0,web,109.56,8.0
2,1818994,6003531696,2022-09-01,FR,f,Electronics,0.0,0.0,web,412.62,6.0
3,1832000,5354845851,2022-09-01,DE,f,Car Parts and Accessories,0.0,0.0,mobile web,176.83,3.0
4,1524768,7660272800,2022-09-01,FR,m,Fashion,0.0,0.0,app,43.0,10.0


In [3]:
c_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 601878 entries, 0 to 601877
Data columns (total 11 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   user_id                601878 non-null  int64  
 1   session_id             601878 non-null  int64  
 2   date                   601878 non-null  object 
 3   country                601878 non-null  object 
 4   gender                 601878 non-null  object 
 5   category               601878 non-null  object 
 6   has_account            601878 non-null  float64
 7   has_price_alert        601878 non-null  float64
 8   platform               601878 non-null  object 
 9   average_leadout_price  601878 non-null  float64
 10  leadouts               601878 non-null  float64
dtypes: float64(4), int64(2), object(5)
memory usage: 50.5+ MB


In [4]:
# Extracting date column
#Convert the 'date' column to a datetime data type:
c_data['date'] = pd.to_datetime(c_data['date'])

In [5]:
#Extract the desired information from the 'date' column:
c_data['year'] = c_data['date'].dt.year
c_data['month'] = c_data['date'].dt.month
c_data['day'] = c_data['date'].dt.day

In [6]:
#Drop the original 'date' column if it's no longer needed:
c_data.drop('date', axis=1, inplace=True)

In [7]:
c_data.head()

Unnamed: 0,user_id,session_id,country,gender,category,has_account,has_price_alert,platform,average_leadout_price,leadouts,year,month,day
0,1591926,5169471070,AT,m,Electronics,0.0,0.0,web,257.79,7.0,2022,9,1
1,1892859,5350443957,AT,f,Electronics,0.0,0.0,web,109.56,8.0,2022,9,1
2,1818994,6003531696,FR,f,Electronics,0.0,0.0,web,412.62,6.0,2022,9,1
3,1832000,5354845851,DE,f,Car Parts and Accessories,0.0,0.0,mobile web,176.83,3.0,2022,9,1
4,1524768,7660272800,FR,m,Fashion,0.0,0.0,app,43.0,10.0,2022,9,1


In [8]:
#To implement encoding for the given data
#Import the necessary libraries:
from sklearn.preprocessing import LabelEncoder


In [9]:
#Initialize the LabelEncoder object:

label_encoder = LabelEncoder()

In [10]:
#Encode the categorical columns:

c_data['country_encoded'] = label_encoder.fit_transform(c_data['country'])
c_data['gender_encoded'] = label_encoder.fit_transform(c_data['gender'])
c_data['category_encoded'] = label_encoder.fit_transform(c_data['category'])
c_data['platform_encoded'] = label_encoder.fit_transform(c_data['platform'])


In [11]:
c_data.head()

Unnamed: 0,user_id,session_id,country,gender,category,has_account,has_price_alert,platform,average_leadout_price,leadouts,year,month,day,country_encoded,gender_encoded,category_encoded,platform_encoded
0,1591926,5169471070,AT,m,Electronics,0.0,0.0,web,257.79,7.0,2022,9,1,0,2,1,2
1,1892859,5350443957,AT,f,Electronics,0.0,0.0,web,109.56,8.0,2022,9,1,0,1,1,2
2,1818994,6003531696,FR,f,Electronics,0.0,0.0,web,412.62,6.0,2022,9,1,2,1,1,2
3,1832000,5354845851,DE,f,Car Parts and Accessories,0.0,0.0,mobile web,176.83,3.0,2022,9,1,1,1,0,1
4,1524768,7660272800,FR,m,Fashion,0.0,0.0,app,43.0,10.0,2022,9,1,2,2,2,0


In [12]:
#Separate the features (X) and the target variable (y):

X = c_data.drop('leadouts', axis=1)
y = c_data['leadouts']

In [13]:
#Split the data into training and testing sets:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
## Drop the original categorical columns

X_train.drop(['country', 'gender', 'category', 'platform'], axis=1, inplace=True)
X_test.drop(['country', 'gender', 'category', 'platform'], axis=1, inplace=True)


In [15]:
#Normalize or scale the numeric features if necessary:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [16]:
# Train with Linear Regression

from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)


In [17]:
#Once you have obtained the predicted values y_pred using your trained model, you can evaluate 
#the performance of your regression model using various evaluationmetrics

#Mean Squared Error (MSE):
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_pred)


In [18]:
#Root Mean Squared Error (RMSE):
import numpy as np
from sklearn.metrics import mean_squared_error

rmse = np.sqrt(mean_squared_error(y_test, y_pred))

In [19]:
#Mean Absolute Error (MAE): 
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, y_pred)

In [20]:
#R-squared (Coefficient of Determination):
from sklearn.metrics import r2_score

r2 = r2_score(y_test, y_pred)


In [21]:
#Print all results
print("Linear Regression:")
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2):", r2)

Linear Regression:
Mean Squared Error (MSE): 45.709074131384625
Root Mean Squared Error (RMSE): 6.760848625090243
Mean Absolute Error (MAE): 3.7569391913831183
R-squared (R2): 0.1086959157145776


In [22]:
#Decision Tree Regressor

from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
import numpy as np

# Initialize the model
model = DecisionTreeRegressor()

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict the target variable for the test data
y_pred = model.predict(X_test)

# Evaluate the model performance
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = metrics.mean_absolute_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print("Decision Tree Regressor:")
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2):", r2)

Decision Tree Regressor:
Mean Squared Error (MSE): 85.8143816043065
Root Mean Squared Error (RMSE): 9.26360521634566
Mean Absolute Error (MAE): 3.9953146806672426
R-squared (R2): -0.6733375214403903


In [None]:
#Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor

# Initialize the model
model = RandomForestRegressor()

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict the target variable for the test data
y_pred = model.predict(X_test)

# Evaluate the model performance
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = metrics.mean_absolute_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print("Random Forest Regressor:")
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2):", r2)

In [None]:
# XGBOOST Regressor

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb

#Initialize and train the XGBoost regression model
model = xgb.XGBRegressor()
model.fit(X_train, y_train)

#Make predictions:
y_pred = model.predict(X_test)

# Evaluate the model performance:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("XGBoost Regressor:")
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2):", r2)



In [None]:
#Feature Importance with XGBOOST

import xgboost as xgb
import matplotlib.pyplot as plt

# Train an XGBoost model
model = xgb.XGBRegressor()
model.fit(X_train, y_train)

# Get feature importances
importances = model.feature_importances_

# Get feature names
feature_names = X_train.columns

# Sort feature importances in descending order
sorted_indices = importances.argsort()[::-1]
sorted_importances = importances[sorted_indices]
sorted_feature_names = feature_names[sorted_indices]

# Plot feature importances
plt.figure(figsize=(10, 6))
plt.barh(range(len(sorted_importances)), sorted_importances, tick_label=sorted_feature_names)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('XGBoost Feature Importances')
plt.show()

In [None]:
#Correlation again as Results
import pandas as pd

# Create a new DataFrame with features and the target variable
df = pd.concat([X_train, y_train], axis=1)

# Calculate the correlation matrix
correlation_matrix = df.corr()

# Sort the correlation matrix by the correlation with the target variable
correlation_with_target = correlation_matrix['leadouts'].sort_values(ascending=False)

# Print the correlation coefficients
print(correlation_with_target)


In [None]:
# Actionable Insights

# Targeted Marketing Campaigns via Demographics

import matplotlib.pyplot as plt
import numpy as np


# Identify the important demographic features
demographic_features = ['gender', 'country']  # Specify the demographic features of interest

# Segment the user base based on demographic features
segmented_data = c_data[demographic_features + ['leadouts']].copy()

# Group the data by gender and country
grouped_data = segmented_data.groupby(['gender', 'country']).sum().reset_index()

# Get unique genders and countries
genders = grouped_data['gender'].unique()
countries = grouped_data['country'].unique()

# Initialize the lead-out values for each gender and country
leadouts = np.zeros((len(genders), len(countries)))

# Fill in the lead-out values
for i, gender in enumerate(genders):
    for j, country in enumerate(countries):
        leadouts[i, j] = grouped_data[(grouped_data['gender'] == gender) & (grouped_data['country'] == country)]['leadouts']

# Set up the plot
plt.figure(figsize=(10, 6))

# Define the x-axis positions for the bars
x = np.arange(len(countries))

# Define the width of the bars
width = 0.35

# Plot the bars for each gender
for i, gender in enumerate(genders):
    plt.bar(x + i * width, leadouts[i], width, label=gender)

# Add labels, title, and legend
plt.xlabel('Country')
plt.ylabel('Lead-outs')
plt.title('Effects of Gender on Lead-outs with Country Categorization')
plt.xticks(x, countries)
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Price Optimization via Average Lead Out Price

import numpy as np
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Create a scatter plot with regression line
sns.regplot(x='average_leadout_price', y='leadouts', data=c_data)

# Add labels and title
plt.xlabel('Average Lead-out Price')
plt.ylabel('Lead-outs')
plt.title('Effect of Average Lead-out Price on Lead-outs')

# Fit a linear regression model
X = c_data['average_leadout_price']
X = sm.add_constant(X)  # Add a constant term to the predictor variable
y = c_data['leadouts']

model = sm.OLS(y, X)
results = model.fit()

# Extract the coefficient and p-value
coefficient = results.params['average_leadout_price']
p_value = results.pvalues['average_leadout_price']

# Add the coefficient and p-value to the plot
plt.text(0.9, 0.9, f'Coefficient: {coefficient:.2f}\nP-value: {p_value:.2f}', ha='right', va='top',
         transform=plt.gca().transAxes)

plt.show()

In [None]:
# Segmentation on User_id via Leadouts


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Calculate lead-outs count for each user_id
leadouts_count = c_data['user_id'].value_counts()

# Prepare the data for clustering
X = leadouts_count.values.reshape(-1, 1)

# Perform clustering
n_clusters = 3  # Number of clusters (you can adjust this based on your data)
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(X)

# Assign cluster labels to user_id
leadouts_clustered = pd.DataFrame({'user_id': leadouts_count.index, 'leadouts_count': leadouts_count.values, 'cluster_label': cluster_labels})

# Categorize cluster labels into "Low", "Medium", and "High"
cluster_sizes = leadouts_clustered.groupby('cluster_label').size()
cluster_categories = ['Low', 'Medium', 'High']
leadouts_clustered['leadouts_category'] = leadouts_clustered['cluster_label'].apply(lambda x: cluster_categories[x])

# Plot the segmentation
segment_sizes = leadouts_clustered['leadouts_category'].value_counts()

plt.figure(figsize=(8, 6))
segment_sizes.plot(kind='bar', color='blue')
plt.xlabel('Lead-outs Category')
plt.ylabel('Number of Users')
plt.title('User Segmentation based on Lead-outs')
plt.xticks(rotation=0)
plt.show()



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Calculate lead-outs count for each user_id
leadouts_count = c_data['user_id'].value_counts()

# Prepare the data for clustering
X = leadouts_count.values.reshape(-1, 1)

# Perform clustering
n_clusters = 3  # Number of clusters (you can adjust this based on your data)
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(X)

# Assign cluster labels to user_id
leadouts_clustered = pd.DataFrame({'user_id': leadouts_count.index, 'leadouts_count': leadouts_count.values, 'cluster_label': cluster_labels})

# Categorize cluster labels into "Low", "Medium", and "High"
cluster_sizes = leadouts_clustered.groupby('cluster_label').size()
cluster_categories = ['Low', 'Medium', 'High']
leadouts_clustered['leadouts_category'] = leadouts_clustered['cluster_label'].apply(lambda x: cluster_categories[x])

# Plot the segmentation
segment_sizes = leadouts_clustered['leadouts_category'].value_counts()

plt.figure(figsize=(8, 6))
ax = segment_sizes.plot(kind='bar', color='blue')
plt.xlabel('Lead-outs Category')
plt.ylabel('Number of Users')
plt.title('User Segmentation based on Lead-outs')
plt.xticks(rotation=0)

# Add text annotations to each bar
for i, v in enumerate(segment_sizes):
    ax.text(i, v, str(v), ha='center', va='bottom')

plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Calculate lead-outs count for each user_id
leadouts_count = c_data['user_id'].value_counts()

# Prepare the data for clustering
X = leadouts_count.values.reshape(-1, 1)

# Perform clustering
n_clusters = 3  # Number of clusters (you can adjust this based on your data)
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(X)

# Assign cluster labels to user_id
leadouts_clustered = pd.DataFrame({'user_id': leadouts_count.index, 'leadouts_count': leadouts_count.values, 'cluster_label': cluster_labels})

# Categorize cluster labels into "Low", "Medium", and "High"
cluster_sizes = leadouts_clustered.groupby('cluster_label').size()
cluster_categories = ['Low_Leadouts', 'Medium_Leadouts', 'High_Leadouts']
leadouts_clustered['leadouts_category'] = leadouts_clustered['cluster_label'].apply(lambda x: cluster_categories[x])

# Calculate segment sizes
segment_sizes = leadouts_clustered['leadouts_category'].value_counts()

# Plot the segmentation as a pie chart
plt.figure(figsize=(8, 6))
colors = ['blue', 'orange', 'green']
explode = (0.1, 0.1, 0.1)  # Explode the slices for emphasis
plt.pie(segment_sizes, labels=segment_sizes.index, colors=colors, explode=explode, autopct='%1.1f%%', startangle=90)
plt.title('User Segmentation based on Lead-outs')

plt.axis('equal')  # Ensure a circular pie chart
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Group the data by 'country', 'has_account', and 'category' and count the number of occurrences
segmentation_counts = c_data.groupby(['country', 'has_account', 'category']).size().unstack(fill_value=0)

# Plot the segmentation using stacked bar plots
segmentation_counts.plot(kind='bar', stacked=True, figsize=(10, 6))

# Set the labels and title
plt.xlabel('Country and Account Status')
plt.ylabel('Count')
plt.title('User Distrubition based on Country, Account Status, and Category')

# Show the legend
plt.legend()

# Adjust the spacing between subplots
plt.tight_layout()

# Show the plot
plt.show()
