In [20]:
#data cleaning

import pandas as pd

# Read the raw survey responses exported from the form
file_path = '/Users/pritam/Documents/ML Project/Data Collection for ML mini project (Responses) - Form Responses 1.csv'
df = pd.read_csv(file_path)

# Keyword looked for inside each lower-cased, stripped header -> clean column name.
# Order matters: for a given header the first keyword that matches wins.
keyword_to_name = {
    'teamwork preference': 'teamwork_preference',
    'introversion extraversion': 'introversion_extraversion',
    'books read past year': 'books_read_past_year',
    'club top1': 'club_top1',
    'weekly_hobby_hours': 'weekly_hobby_hours',
    'hobby_top1': 'hobby_top1',
    'hobby top2': 'hobby_top2',
    'club top 2': 'club_top2',
}

# Build {original header: clean name}; headers matching no keyword are left out
column_mapping = {}
for col in df.columns:
    clean_col = col.strip().lower()
    for keyword, clean_name in keyword_to_name.items():
        if keyword in clean_col:
            column_mapping[col] = clean_name
            break

# Apply the clean names to the full frame
df = df.rename(columns=column_mapping)

# Keep only the columns that were matched, as an independent copy
relevant_columns = list(column_mapping.values())
df_buddy = df[relevant_columns].copy()

# Show the first rows so the renamed columns can be checked by eye
print("\nDataFrame after cleaning and selecting columns:")
df_buddy.head()

In [12]:
# Now that the columns are correctly selected and renamed, clean the numeric data and prepare the categorical features for the machine learning model.

import numpy as np

# Define a function to clean numeric columns with inconsistent data
def clean_numeric_column(series):
    """Convert a column of free-text numeric answers into integers.

    The first number found in each value is kept, so answers carrying a unit
    or other decoration ('5 hours', '3hrs', '10+') and ranges ('9-10', which
    yields 9) are all parsed. Values containing no digits at all, including
    missing values, become 0. Decimal answers are truncated towards zero.

    Parameters
    ----------
    series : pandas.Series
        Raw column values; any dtype, each value is converted to text first.

    Returns
    -------
    pandas.Series
        Integer series with the same index as ``series``.
    """
    # Drop all whitespace so that '5 hrs' and '9 - 10' parse like '5hrs' and '9-10'
    text = series.astype(str).str.lower().str.replace(r'\s+', '', regex=True)
    # Take the first run of digits (with an optional decimal part) instead of
    # stripping a fixed list of unit strings: any unlisted suffix used to turn
    # the whole answer into 0.
    first_number = text.str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    # Values without digits come back as NaN from extract and are mapped to 0
    return pd.to_numeric(first_number, errors='coerce').fillna(0).astype(int)

# Normalise the two free-text numeric survey answers to integers
for numeric_col in ('books_read_past_year', 'weekly_hobby_hours'):
    df_buddy[numeric_col] = clean_numeric_column(df_buddy[numeric_col])

# Columns holding category labels that must be one-hot encoded
categorical_cols = ['hobby_top1', 'hobby_top2', 'club_top1', 'club_top2']

# One indicator column per (feature, category) pair, prefixed with the feature name
df_encoded = pd.get_dummies(
    df_buddy,
    columns=categorical_cols,
    prefix=categorical_cols,
)

# Preview the encoded frame
print("\nFinal Prepared DataFrame Head:")
df_encoded.head()


Final Prepared DataFrame Head:


Unnamed: 0,books_read_past_year,weekly_hobby_hours,introversion_extraversion,teamwork_preference,hobby_top1_Badminton,hobby_top1_Coding,hobby_top1_Cricket,hobby_top1_Dance,hobby_top1_Debate,hobby_top1_Football,...,club_top1_Robotics Club,club_top1_Sports Club,club_top2_Coding Club,club_top2_Cultural Club,club_top2_Drama Club,club_top2_Entrepreneurship Cell,club_top2_Literary Club,club_top2_Music Club,club_top2_Robotics Club,club_top2_Sports Club
0,0,0,2,3,False,False,True,False,False,False,...,True,False,False,False,False,False,False,True,False,False
1,2,40,3,3,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,5,5,3,3,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,5,10,5,5,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,5,5,3,1,True,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False


In [13]:
# Feature Scaling
# The numeric features are measured on different scales (e.g. weekly_hobby_hours
# versus the preference ratings), so they are standardized before similarity
# is computed.

from sklearn.preprocessing import StandardScaler

# Columns to standardize; the one-hot indicator columns are left as they are
numeric_cols = [
    'teamwork_preference',
    'introversion_extraversion',
    'books_read_past_year',
    'weekly_hobby_hours',
]

# Work on a copy so df_encoded keeps the unscaled values
features = df_encoded.copy()

scaler = StandardScaler()
features[numeric_cols] = scaler.fit_transform(features[numeric_cols])


In [14]:
# Similarity Matrix
# Compute the pairwise cosine similarity between all student feature rows.

from sklearn.metrics.pairwise import cosine_similarity

# Entry [i, j] is the similarity between row i and row j of `features`
similarity_matrix = cosine_similarity(features)

print(f"Similarity matrix shape: {similarity_matrix.shape}")


Similarity matrix shape: (111, 111)


In [15]:
# Recommendation Function
# Now create a function that recommends top K buddies for a given student.

def recommend_buddies(student_id, k=5):
    """Return the positions of the k students most similar to ``student_id``.

    Reads the module-level ``similarity_matrix``; row ``student_id`` holds that
    student's similarity to every student.

    Parameters
    ----------
    student_id : int
        Row position of the student in ``similarity_matrix``.
    k : int, default 5
        Maximum number of buddies to return; values <= 0 give an empty list.

    Returns
    -------
    list of int
        Other students' positions, most similar first. Ties keep ascending
        position order. The student is never included in their own list.
    """
    scores = similarity_matrix[student_id]
    # Exclude the student explicitly rather than dropping whatever sorts first:
    # with tied scores (e.g. an identical profile at a lower position) the
    # student is not guaranteed to be the top entry of the ranking.
    candidates = [idx for idx in range(len(scores)) if idx != student_id]
    # sorted() is stable, so equal scores stay in ascending position order
    ranked = sorted(candidates, key=lambda idx: scores[idx], reverse=True)
    return ranked[:max(k, 0)]

# Example: Recommend top 5 buddies for student 0
#print("Recommended buddies for student 0:", recommend_buddies(0, k=5))

In [17]:
import pandas as pd

# ---------- Define relevance between buddies ----------
def get_relevant_buddies(student_id, df):
    """Return the row positions of all students considered relevant buddies.

    Another student is relevant when at least one of these holds:
    same ``club_top1``, ``teamwork_preference`` within 1, or
    ``introversion_extraversion`` within 1.

    NOTE(review): because any single criterion is enough, a large share of
    students may count as relevant, which can inflate precision - confirm
    this rule is the intended ground truth.

    Parameters
    ----------
    student_id : int
        Row position (0-based) of the student within ``df``.
    df : pandas.DataFrame
        Must contain the columns ``club_top1``, ``teamwork_preference`` and
        ``introversion_extraversion``.

    Returns
    -------
    list of int
        Ascending row positions of the relevant students, never including
        ``student_id`` itself.
    """
    # Address rows by position: ids come from range(len(df)), so they only
    # coincide with index labels when the frame has a default index.
    club = df['club_top1'].reset_index(drop=True)
    teamwork = df['teamwork_preference'].reset_index(drop=True)
    introversion = df['introversion_extraversion'].reset_index(drop=True)

    # Compare the chosen student against every row at once
    same_club = club == club.iloc[student_id]
    teamwork_close = (teamwork - teamwork.iloc[student_id]).abs() <= 1
    introvert_close = (introversion - introversion.iloc[student_id]).abs() <= 1

    is_relevant = same_club | teamwork_close | introvert_close
    # A student is never their own buddy
    is_relevant.iloc[student_id] = False
    return is_relevant[is_relevant].index.tolist()


# ---------- Precision@K and Recall@K ----------
def precision_at_k(recommended, relevant, k):
    """Fraction of the k requested slots filled by relevant recommendations.

    Only the first ``k`` entries of ``recommended`` are considered and the hit
    count is always divided by ``k``. Returns 0 when ``k`` is not positive.
    """
    if not (k > 0):
        return 0
    hits = set(recommended[:k]) & set(relevant)
    return len(hits) / k

def recall_at_k(recommended, relevant, k):
    """Fraction of the relevant items found in the first k recommendations.

    The hit count is divided by ``len(relevant)``. Returns 0 when ``relevant``
    is empty.
    """
    total_relevant = len(relevant)
    if total_relevant == 0:
        return 0
    hits = set(recommended[:k]) & set(relevant)
    return len(hits) / total_relevant


# ---------- Evaluate system for all students ----------
def evaluate_recommendation_system(df, recommend_func, k=5):
    """Average Precision@K and Recall@K of a recommender over all students.

    For every row position in ``df`` the relevant buddies are obtained from
    ``get_relevant_buddies`` and compared with ``recommend_func(student_id, k)``.
    Students with no relevant buddies are skipped and do not affect the
    averages. The averages are printed and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Student data passed on to ``get_relevant_buddies``; only its length is
        used directly here.
    recommend_func : callable
        Called as ``recommend_func(student_id, k)``; must return a sequence of
        recommended student positions.
    k : int, default 5
        Cut-off used for both metrics.

    Returns
    -------
    tuple of (float, float)
        ``(average precision, average recall)``; ``(0.0, 0.0)`` when no student
        could be evaluated.
    """
    precision_scores = []
    recall_scores = []

    for student_id in range(len(df)):
        relevant = get_relevant_buddies(student_id, df)

        # Decide on skipping before asking for recommendations, so no
        # recommender work is done for students that cannot be scored.
        if len(relevant) == 0:
            continue

        recommended = recommend_func(student_id, k)
        precision_scores.append(precision_at_k(recommended, relevant, k))
        recall_scores.append(recall_at_k(recommended, relevant, k))

    # Empty input, or every student skipped: averaging would divide by zero
    if not precision_scores:
        print("No students with relevant buddies; nothing to evaluate.")
        return 0.0, 0.0

    avg_precision = sum(precision_scores) / len(precision_scores)
    avg_recall = sum(recall_scores) / len(recall_scores)

    print(f"📊 Evaluation Results (K={k})")
    print(f"Average Precision@{k}: {avg_precision:.3f}")
    print(f"Average Recall@{k}: {avg_recall:.3f}")

    return avg_precision, avg_recall


In [18]:
# Score the cosine-similarity recommender on every student in df_buddy at K=5.
# The call prints the averaged metrics; as the last expression of the cell it
# also displays the returned (average precision, average recall) tuple.
evaluate_recommendation_system(df_buddy, recommend_buddies, k=5)


📊 Evaluation Results (K=5)
Average Precision@5: 0.995
Average Recall@5: 0.054


(0.9945945945945946, 0.053798040946921304)