# Multiple Choice Classifier

The purpose of this notebook is to process the multiple-choice features on users' profiles and train a decision-tree-based classifier (gradient boosting) on them. An extra feature I'd like to explore is explainable AI - it would be nice to explain to the user which features affected the resulting match percentage.

In [105]:
import pandas as pd
import json

# Load the scraped profiles. The context manager closes the file handle as soon
# as parsing finishes, and the explicit UTF-8 encoding keeps non-ASCII text
# (e.g. the typographic apostrophe in "Don’t know yet") from depending on the
# platform's default encoding.
with open('data/bumble.json', encoding='utf-8') as f:
    profiles = json.load(f)

def transform_data_to_dataframe(data):
    """Flatten a list of profile dicts into a DataFrame, one row per profile.

    Each profile contributes four fixed columns -- ``age`` (whitespace-stripped),
    ``isVerified``, ``location_distance`` (read from ``location.distance``) and
    ``liked`` -- followed by one column per entry of its ``attributes`` list,
    named after the entry's ``type`` and holding its ``value``.

    Parameters
    ----------
    data : iterable of dict
        Profile objects with the keys ``age``, ``isVerified``, ``location``,
        ``liked`` and ``attributes``.

    Returns
    -------
    pandas.DataFrame
        One row per profile; attributes a profile does not have are left
        missing in its row.

    Raises
    ------
    KeyError
        If a profile lacks one of the keys listed above.
    """
    def _profile_to_row(profile):
        # Fixed columns first so they lead the column order of the frame.
        fixed = {
            'age': profile['age'].strip(),
            'isVerified': profile['isVerified'],
            'location_distance': profile['location']['distance'],
            'liked': profile['liked'],
        }
        # An attribute whose type matches an existing key overrides it.
        answers = {attr['type']: attr['value'] for attr in profile['attributes']}
        return {**fixed, **answers}

    return pd.DataFrame([_profile_to_row(profile) for profile in data])

# Build the working frame from the loaded profiles and preview the first rows.
df = transform_data_to_dataframe(profiles)
df.head()

Unnamed: 0,age,isVerified,location_distance,liked,height,exercise,gender,intentions,familyPlans,drinking,smoking,cannabis,starSign,religion,Politics,education
0,20,False,~36 km away,False,168 cm,Sometimes,Woman,Don’t know yet,Not sure yet,,,,,,,
1,19,False,~12 km away,False,,Almost never,Woman,,,Rarely,Never,Never,Pisces,Christian,,
2,19,True,~14 km away,True,165 cm,Active,Woman,Don’t know yet,Not sure yet,Socially,Socially,Socially,Leo,Atheist,Left,
3,22,True,,False,170 cm,Almost never,,Don’t know yet,Not sure yet,Socially,Socially,,,,,
4,24,True,,False,170 cm,Active,Woman,,,Socially,Never,,,,,I’m a postgrad


Let's process some of these fields.

In [106]:
# Reformat location_distance to a numerical feature: strip the "~" prefix and
# the "km away" suffix, then parse. Missing or unparseable values become NaN.
df['location_distance'] = pd.to_numeric(
    df['location_distance']
    .str.replace('~', '', regex=False)
    .str.replace('km away', '', regex=False)
    .str.strip(),
    errors='coerce',
)

# Reformat height to a numerical feature by dropping the "cm" unit.
df['height'] = pd.to_numeric(
    df['height'].str.replace('cm', '', regex=False).str.strip(),
    errors='coerce',
)

# Convert age from string to a numeric dtype.
df['age'] = pd.to_numeric(df['age'], errors='coerce')

# Replace NaN values with the column mean - only location_distance and height,
# don't think it will hurt the model much.
# The result is assigned back rather than using fillna(inplace=True) on a column
# selection: that form is chained assignment, which newer pandas warns about and
# which does not modify df once Copy-on-Write is enabled.
df['location_distance'] = df['location_distance'].fillna(df['location_distance'].mean())
df['height'] = df['height'].fillna(df['height'].mean())

df.head()

Unnamed: 0,age,isVerified,location_distance,liked,height,exercise,gender,intentions,familyPlans,drinking,smoking,cannabis,starSign,religion,Politics,education
0,20,False,36.0,False,168.0,Sometimes,Woman,Don’t know yet,Not sure yet,,,,,,,
1,19,False,12.0,False,169.893805,Almost never,Woman,,,Rarely,Never,Never,Pisces,Christian,,
2,19,True,14.0,True,165.0,Active,Woman,Don’t know yet,Not sure yet,Socially,Socially,Socially,Leo,Atheist,Left,
3,22,True,20.396825,False,170.0,Almost never,,Don’t know yet,Not sure yet,Socially,Socially,,,,,
4,24,True,20.396825,False,170.0,Active,Woman,,,Socially,Never,,,,,I’m a postgrad


In [107]:
# Inspect the column dtypes after the numeric conversions.
df.dtypes

age                    int64
isVerified              bool
location_distance    float64
liked                   bool
height               float64
exercise              object
gender                object
intentions            object
familyPlans           object
drinking              object
smoking               object
cannabis              object
starSign              object
religion              object
Politics              object
education             object
dtype: object

Now let's apply a min-max scaler to our numerical features.

In [108]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

numerical_features = ['age', 'height', 'location_distance']
numerical_df = df[numerical_features]

# fit_transform returns a bare array, so the scaled frame is rebuilt on df's own
# index. Without index=df.index it would get a fresh 0..n-1 index and the
# assignments below would misalign (or produce NaN) whenever df's index is not
# the default one, e.g. after rows have been filtered out.
scaled_numerical_df = pd.DataFrame(
    scaler.fit_transform(numerical_df),
    columns=numerical_features,
    index=df.index,
)

# Overwrite each numeric column with its [0, 1]-scaled version.
for feature in numerical_features:
    df[feature] = scaled_numerical_df[feature]

df.head()

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,age,isVerified,location_distance,liked,height,exercise,gender,intentions,familyPlans,drinking,smoking,cannabis,starSign,religion,Politics,education
0,0.142857,False,0.772727,False,0.40625,Sometimes,Woman,Don’t know yet,Not sure yet,,,,,,,
1,0.0,False,0.227273,False,0.465431,Almost never,Woman,,,Rarely,Never,Never,Pisces,Christian,,
2,0.0,True,0.272727,True,0.3125,Active,Woman,Don’t know yet,Not sure yet,Socially,Socially,Socially,Leo,Atheist,Left,
3,0.428571,True,0.41811,False,0.46875,Almost never,,Don’t know yet,Not sure yet,Socially,Socially,,,,,
4,0.714286,True,0.41811,False,0.46875,Active,Woman,,,Socially,Never,,,,,I’m a postgrad


Now let's apply one-hot encoding to our textual features.

In [109]:
# Select every object-dtype (textual) column for one-hot encoding. The boolean
# columns (isVerified, liked) are not object dtype, so they are not selected and
# pass through unchanged.
textual_cols = df.select_dtypes(include=['object']).columns.tolist()

# Each selected column is replaced by one indicator column per distinct answer,
# named "<column>_<answer>".
df_encoded = pd.get_dummies(df, columns=textual_cols)
df_encoded.head()

Unnamed: 0,age,isVerified,location_distance,liked,height,exercise_Active,exercise_Almost never,exercise_Sometimes,gender_Trans woman,gender_Woman,...,Politics_Apolitical,Politics_Left,Politics_Liberal,Politics_Moderate,education_I’m a postgrad,education_I’m an undergrad,education_Postgraduate degree,education_Sixth form,education_Technical college,education_Undergraduate degree
0,0.142857,False,0.772727,False,0.40625,False,False,True,False,True,...,False,False,False,False,False,False,False,False,False,False
1,0.0,False,0.227273,False,0.465431,False,True,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2,0.0,True,0.272727,True,0.3125,True,False,False,False,True,...,False,True,False,False,False,False,False,False,False,False
3,0.428571,True,0.41811,False,0.46875,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,0.714286,True,0.41811,False,0.46875,True,False,False,False,True,...,False,False,False,False,True,False,False,False,False,False


Cool, now let's train a model.

In [110]:
# Class balance of the target column, expressed as percentages.
df['liked'].value_counts(normalize=True) * 100

liked
False    75.362319
True     24.637681
Name: proportion, dtype: float64

In [111]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Separate the features from the target variable 'liked'. The numeric columns
# (age, height, location_distance) are dropped as well, leaving isVerified plus
# the one-hot encoded columns as features.
# NOTE(review): presumably intentional, since this notebook targets the
# multiple-choice features - confirm.
X = df_encoded.drop(columns=['liked', 'age', 'height', 'location_distance'])
y = df_encoded['liked'] 

# Split the data into training and testing sets (70/30, fixed seed)
# NOTE(review): the split is not stratified even though the classes are
# imbalanced - consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the gradient boosting classifier
clf = GradientBoostingClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out test rows
report = classification_report(y_test, y_pred, target_names=['False', 'True'])
print(report)

              precision    recall  f1-score   support

       False       0.78      0.97      0.86        32
        True       0.50      0.10      0.17        10

    accuracy                           0.76        42
   macro avg       0.64      0.53      0.51        42
weighted avg       0.71      0.76      0.70        42



  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
