In [11]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

# Read the CSV file
file_path = '/Users/allenyang/Downloads/project-clean-data.csv'
data = pd.read_csv(file_path)

# Remove rows with NaN values in any relevant columns
data_cleaned = data.dropna(subset=['F_RACECMB', 'F_EDUCCAT2', 'F_AGECAT', 'F_INC_SDT1', 'F_CREGION', 'F_RELIG', 'F_PARTY_FINAL', 'Social Media'])

# Selecting relevant columns and renaming them
data_cleaned = data_cleaned[['F_RACECMB', 'F_EDUCCAT2', 'F_AGECAT', 'F_INC_SDT1', 'F_CREGION', 'F_RELIG', 'F_PARTY_FINAL', 'Social Media']]
data_cleaned.columns = ['Race', 'Education', 'Age', 'Income', 'Region', 'Religion', 'Party', 'Attitude']

# Mapping target variable
attitude_mapping = {'Bad idea for society': 0, 'Good idea for society': 1, 'Neither good nor bad': 2}
data_cleaned['Attitude'] = data_cleaned['Attitude'].map(attitude_mapping)

# Ensure no NaN values exist in the cleaned data
data_cleaned = data_cleaned.dropna()

# Encode categorical variables
label_encoders = {}
for column in ['Race', 'Education', 'Age', 'Income', 'Region', 'Religion', 'Party']:
    le = LabelEncoder()
    data_cleaned[column] = le.fit_transform(data_cleaned[column])
    label_encoders[column] = le

# Splitting features and target
X = data_cleaned.drop('Attitude', axis=1)
y = data_cleaned['Attitude']

# Ensure X and y have consistent length
X = X.loc[y.index]

# Train a Decision Tree Classifier
clf = DecisionTreeClassifier(criterion='entropy')
clf.fit(X, y)

# Use feature importances from the decision tree to rank features
importances = clf.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print all features ranked by their importance
print("Feature rankings in descending order of influence:")
print(feature_importance_df)

# Select the top 3 features
top_3_features = feature_importance_df['Feature'].head(3).tolist()

print("Top 3 features:", top_3_features)


Feature rankings in descending order of influence:
     Feature  Importance
3     Income    0.210101
6      Party    0.165702
5   Religion    0.149855
4     Region    0.145295
1  Education    0.136898
2        Age    0.126526
0       Race    0.065624
Top 3 features: ['Income', 'Party', 'Religion']


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the datasets
customers_df = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/customers_final.csv')
engagements_df = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/engagements_final.csv')
marketing_df = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/marketing_final.csv')
transactions_df = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/transactions_final.csv')

# Merge the datasets on customer_id
merged_data = customers.merge(transactions, on='customer_id', how='left') \
                       .merge(engagements, on='customer_id', how='left') \
                       .merge(marketing, on='customer_id', how='left')

# Handle missing values
median_age = merged_data['age'].median()
merged_data['age'].fillna(median_age, inplace=True)

most_frequent_gender = merged_data['gender'].mode()[0]
merged_data['gender'].fillna(most_frequent_gender, inplace=True)

# Ensure date columns are in datetime format
merged_data['join_date'] = pd.to_datetime(merged_data['join_date'])
merged_data['last_purchase_date'] = pd.to_datetime(merged_data['last_purchase_date'])

# Calculate customer lifespan in months
merged_data['customer_lifespan_months'] = (merged_data['last_purchase_date'] - merged_data['join_date']).dt.days / 30

# Calculate transactions per month
merged_data['transactions_per_month'] = merged_data['total_transactions'] / merged_data['customer_lifespan_months']

# Calculate average spending per month
merged_data['average_spending_per_month'] = merged_data['avg_transaction_amount'] * merged_data['transactions_per_month']

# Calculate the new CLV
merged_data['CLV'] = merged_data['average_spending_per_month'] * merged_data['customer_lifespan_months']

# Drop rows with missing CLV
merged_data = merged_data.dropna(subset=['CLV'])

# Select relevant features including gender
X = merged_data[['age', 'gender', 'total_transactions', 'avg_transaction_amount', 'recency', 'number_of_site_visits', 'number_of_emails_opened', 'number_of_clicks']]
X = pd.get_dummies(X, columns=['gender'], drop_first=True)

y = merged_data['CLV']

# Split the data into training, testing, and prediction sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_test, X_predict, y_test, y_predict = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

# Predict on the prediction set
y_pred_future = model.predict(X_predict)

# Print the results
print(f'MAE: {mae}, RMSE: {rmse}, R2: {r2}')
print(f'Future Predictions: {y_pred_future[:10]}')  # Display the first 10 predictions for the prediction set


NameError: name 'customers' is not defined