# Data Cleaning for Lecturer-Course Matching
This notebook cleans and prepares data for a logistic regression model to match lecturers with courses.

In [23]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Read the CSV file
# 'data.csv' is a relative path, so it is resolved against the kernel's current
# working directory.
df = pd.read_csv('data.csv')
print(f"Original shape: {df.shape}")
# Last expression of the cell: renders the first five rows as the cell output.
df.head()

Original shape: (24, 15)


Unnamed: 0,SN,Rating,Institute name,Division,Status,Program Name,Hourly Payment for Lecturer,"Level (Doctorate, Masters, PostGraduate, Bachelors, HND, HNC)",Time Preference,Student Count,Duration in days,Language,Subject Name,Number of Credits,Lecturer Index
0,1,5,Edith Cowan University - Sri Lanka,Kotte,ACTIVE,Bachelor of Commerce,3300,BACHELORS,WEEKEND,160,1095,ENGLISH,Strategic Management,3,1
1,2,5,Edith Cowan University - Sri Lanka,Kotte,ACTIVE,Bachelor of Commerce,3300,BACHELORS,WEEKEND,160,1095,ENGLISH,Human Resource Management,2,1
2,3,5,Edith Cowan University - Sri Lanka,Kotte,ACTIVE,Bachelor of Computer Science,3300,BACHELORS,WEEKEND,40,730,ENGLISH,Object Oriented Analysis and Design,2,2
3,4,5,Edith Cowan University - Sri Lanka,Kotte,ACTIVE,Bachelor of Computer Science,3300,BACHELORS,WEEKEND,40,730,ENGLISH,Object Oriented Programming with C++,2,1
4,5,5,Overseas Campus of Ceylon,Maharagama,ACTIVE,Master of Business Administration in Supply Ch...,2300,MASTERS,WEEKEND,40,730,ENGLISH,Business Statistics,4,1


In [26]:
# List the column labels of the raw frame (these long names are shortened later on).
df.columns

Index(['SN', 'Rating', 'Institute name', 'Division', 'Status', 'Program Name',
       'Hourly Payment for Lecturer',
       'Level (Doctorate, Masters, PostGraduate, Bachelors, HND, HNC)',
       'Time Preference', 'Student Count', 'Duration in days', 'Language',
       'Subject Name', 'Number of Credits', 'Lecturer Index'],
      dtype='object')

In [27]:
# Preview the first five rows of the raw frame (same call as at the end of the loading cell).
df.head()

Unnamed: 0,SN,Rating,Institute name,Division,Status,Program Name,Hourly Payment for Lecturer,"Level (Doctorate, Masters, PostGraduate, Bachelors, HND, HNC)",Time Preference,Student Count,Duration in days,Language,Subject Name,Number of Credits,Lecturer Index
0,1,5,Edith Cowan University - Sri Lanka,Kotte,ACTIVE,Bachelor of Commerce,3300,BACHELORS,WEEKEND,160,1095,ENGLISH,Strategic Management,3,1
1,2,5,Edith Cowan University - Sri Lanka,Kotte,ACTIVE,Bachelor of Commerce,3300,BACHELORS,WEEKEND,160,1095,ENGLISH,Human Resource Management,2,1
2,3,5,Edith Cowan University - Sri Lanka,Kotte,ACTIVE,Bachelor of Computer Science,3300,BACHELORS,WEEKEND,40,730,ENGLISH,Object Oriented Analysis and Design,2,2
3,4,5,Edith Cowan University - Sri Lanka,Kotte,ACTIVE,Bachelor of Computer Science,3300,BACHELORS,WEEKEND,40,730,ENGLISH,Object Oriented Programming with C++,2,1
4,5,5,Overseas Campus of Ceylon,Maharagama,ACTIVE,Master of Business Administration in Supply Ch...,2300,MASTERS,WEEKEND,40,730,ENGLISH,Business Statistics,4,1


## Data Cleaning Steps

In [31]:
# 1. Remove duplicate rows
#
# 'SN' looks like a per-row serial number (1, 2, 3, ... in the preview above).
# If it is compared along with the other columns, no two rows can ever be equal
# and this step silently does nothing. Rows are therefore compared on every
# column except 'SN'; the first occurrence of each record is kept and the
# surviving rows keep their original index labels.
# NOTE(review): confirm that 'SN' carries no information beyond row numbering.
dedup_columns = [col for col in df.columns if col != 'SN']
# Fall back to whole-row comparison (subset=None) if 'SN' is the only column.
df = df.drop_duplicates(subset=dedup_columns or None)
print(f"Shape after removing duplicates: {df.shape}")

Shape after removing duplicates: (24, 15)


In [32]:
# 3. Narrow the raw frame down to the columns used for modelling.
# The order of this list is the column order of `df_cleaned` (and of the saved CSV).
relevant_features = [
    'Program Name', 'Hourly Payment for Lecturer',
    'Level (Doctorate, Masters, PostGraduate, Bachelors, HND, HNC)',
    'Time Preference', 'Student Count',
    'Subject Name', 'Number of Credits',
    'Rating',  # rating of the institute
    'Lecturer Index', 'Duration in days',
    'Division', 'Status', 'Language',
]

# Take an independent copy so later column assignments never touch `df`.
df_cleaned = df.loc[:, relevant_features].copy()

In [33]:
# Replace the verbose source headers with short snake_case names.
# Each pair is (original header, short name).
column_mapping = dict([
    ('Program Name', 'program'),
    ('Hourly Payment for Lecturer', 'hourly_pay'),
    ('Level (Doctorate, Masters, PostGraduate, Bachelors, HND, HNC)', 'level'),
    ('Time Preference', 'time_pref'),
    ('Student Count', 'student_count'),
    ('Subject Name', 'subject'),
    ('Number of Credits', 'credits'),
    ('Rating', 'institute_rating'),
    ('Lecturer Index', 'lecturer_id'),
    ('Duration in days', 'duration'),
    ('Division', 'division'),
    ('Status', 'status'),
    ('Language', 'language'),
])
# rename() returns a new frame; headers missing from the mapping are left as they are.
df_cleaned = df_cleaned.rename(column_mapping, axis='columns')


In [34]:
# Drop rows that are identical across all selected feature columns. With the pandas
# defaults the first occurrence is kept and the surviving rows keep their original
# index labels (the index is not renumbered).
df_cleaned = df_cleaned.drop_duplicates()

In [35]:
# Show (rows, columns) of the de-duplicated feature frame.
df_cleaned.shape

(24, 13)

In [36]:
# 4. Encode categorical variables
#
# Every categorical column is lower-cased and mapped to integer codes by its own
# LabelEncoder; the fitted encoders are kept in `label_encoders`, keyed by the
# short column name, so the mapping can be reused or inverted later.
#
# The values to encode are always read from the raw frame `df` (aligned on
# `df_cleaned.index`), never from `df_cleaned` itself. Reading from `df_cleaned`
# made this cell non-idempotent: on a second run the encoders were refit on the
# already encoded codes ("0", "1", ...), so the stored encoders lost the real
# category names. On a first run the result is identical to the previous
# behaviour.
#
# Caveats:
# - astype(str) turns missing values into the literal category 'nan'.
# - 'lecturer_id' is numeric in the source; after astype(str) its codes follow
#   string sort order (e.g. '10' sorts before '2').
# - NOTE(review): LabelEncoder is documented as a *target* encoder. Used on
#   input features it yields arbitrary integer codes that a linear model reads
#   as ordered magnitudes; consider one-hot encoding for the feature columns.
label_encoders = {}
categorical_columns = [
    'program',
    'level',
    'time_pref',
    'subject',
    'lecturer_id',
    'division',
    'status',
    'language'
]

# Short name -> header in the raw frame (inverse of `column_mapping`).
raw_column_for = {short: raw for raw, short in column_mapping.items()}

for column in categorical_columns:
    # Pull the untouched source values for exactly the rows kept in df_cleaned.
    raw_values = df.loc[df_cleaned.index, raw_column_for[column]]
    # Convert to string and lowercase before encoding
    normalized = raw_values.astype(str).str.lower()
    label_encoders[column] = LabelEncoder()
    df_cleaned[column] = label_encoders[column].fit_transform(normalized)


In [37]:
# 5. Scale numerical features
#
# Standardizes the numeric columns (zero mean, unit variance) and keeps the
# fitted `scaler` so the same transformation can be applied to new data.
#
# The scaler is always fit on the raw values from `df` (aligned on
# `df_cleaned.index`), never on `df_cleaned` itself. Fitting on `df_cleaned`
# made this cell non-idempotent: a second run fit the scaler on data that was
# already standardized, so the stored scaler became close to an identity
# transform and no longer described the real units. On a first run the result
# is identical to the previous behaviour.
scaler = StandardScaler()
numerical_columns = [
    'hourly_pay',
    'student_count',
    'credits',
    'institute_rating',
    'duration'
]

# Short name -> header in the raw frame (inverse of `column_mapping`).
raw_column_for = {short: raw for raw, short in column_mapping.items()}

# Select the raw numeric columns for the surviving rows, then rename them to the
# short names so the scaler records the same feature names as before.
raw_numeric = (
    df.loc[df_cleaned.index, [raw_column_for[name] for name in numerical_columns]]
    .rename(columns=column_mapping)
)

df_cleaned[numerical_columns] = scaler.fit_transform(raw_numeric)

In [38]:
# 7. Save the cleaned data
# Writes the encoded and scaled frame to 'cleaned_data.csv' (relative to the
# current working directory); index=False leaves the row index out of the file.
df_cleaned.to_csv('cleaned_data.csv', index=False)
print("Cleaned data saved to 'cleaned_data.csv'")

Cleaned data saved to 'cleaned_data.csv'


## Data Cleaning Summary

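The step numbers below match the numbered comments in the code cells:
1. Removed duplicate entries
2. Handled missing values — **not performed**: no cell in this notebook checks for or imputes missing values
3. Selected relevant features for the model and renamed the columns to short snake_case names
4. Encoded categorical variables (label encoding)
5. Scaled numerical features (standardization)
6. Prepared features and target variables — **not performed**: the data is not split into features and target here
7. Saved the cleaned dataset to `cleaned_data.csv`

The cleaned data is intended for logistic regression modeling; steps 2 and 6 still have to be carried out before a model is fit.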

In [39]:
# Hand the fitted label encoders and the fitted scaler to the project-local
# `save_encoders` helper.
# NOTE(review): presumably the helper persists them so the same transformations can
# be re-applied later; its source is not visible from this notebook -- confirm.
# NOTE(review): this call was annotated as belonging at the end of training.ipynb,
# but it currently runs in the data-cleaning notebook -- confirm the intended place.
from save_encoders import save_encoders
save_encoders(label_encoders, scaler)