In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the dataset
df = pd.read_csv('/kaggle/input/predict-online-course-engagement-dataset/online_course_engagement_data.csv')

In [3]:
# Display the first few rows of the dataframe
df.head()

Unnamed: 0,UserID,CourseCategory,TimeSpentOnCourse,NumberOfVideosWatched,NumberOfQuizzesTaken,QuizScores,CompletionRate,DeviceType,CourseCompletion
0,5618,Health,29.979719,17,3,50.365656,20.860773,1,0
1,4326,Arts,27.80264,1,5,62.61597,65.632415,1,0
2,5849,Arts,86.820485,14,2,78.458962,63.812007,1,1
3,4992,Science,35.038427,17,10,59.198853,95.433162,0,1
4,3866,Programming,92.490647,16,0,98.428285,18.102478,0,0


In [4]:
# Display the last few rows of the dataframe
df.tail()

Unnamed: 0,UserID,CourseCategory,TimeSpentOnCourse,NumberOfVideosWatched,NumberOfQuizzesTaken,QuizScores,CompletionRate,DeviceType,CourseCompletion
8995,8757,Health,37.445225,14,4,54.469359,32.990704,1,0
8996,894,Science,48.631443,7,7,59.413257,0.254625,0,0
8997,6323,Health,38.212512,3,3,69.508297,70.188159,1,0
8998,3652,Health,70.048665,13,10,79.655182,72.975225,1,1
8999,5595,Health,93.589781,7,5,56.274546,11.299071,0,0


In [5]:
# Check the shape of the dataframe (number of rows and columns)
df.shape

(9000, 9)

In [6]:
# Get information about the dataframe (column names, data types, non-null values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   UserID                 9000 non-null   int64  
 1   CourseCategory         9000 non-null   object 
 2   TimeSpentOnCourse      9000 non-null   float64
 3   NumberOfVideosWatched  9000 non-null   int64  
 4   NumberOfQuizzesTaken   9000 non-null   int64  
 5   QuizScores             9000 non-null   float64
 6   CompletionRate         9000 non-null   float64
 7   DeviceType             9000 non-null   int64  
 8   CourseCompletion       9000 non-null   int64  
dtypes: float64(3), int64(5), object(1)
memory usage: 632.9+ KB


In [7]:
# Get descriptive statistics of the dataframe
df.describe()

Unnamed: 0,UserID,TimeSpentOnCourse,NumberOfVideosWatched,NumberOfQuizzesTaken,QuizScores,CompletionRate,DeviceType,CourseCompletion
count,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0
mean,4498.894556,50.163822,10.024667,5.090556,74.706028,50.340146,0.500667,0.396444
std,2596.849433,28.49175,6.029878,3.157762,14.378383,28.950977,0.500027,0.489186
min,1.0,1.00523,0.0,0.0,50.005119,0.009327,0.0,0.0
25%,2251.75,25.440548,5.0,2.0,62.283451,25.653614,0.0,0.0
50%,4483.5,49.818417,10.0,5.0,74.743294,50.264124,1.0,0.0
75%,6751.25,75.069924,15.0,8.0,87.022663,75.572493,1.0,1.0
max,9000.0,99.992558,20.0,10.0,99.994984,99.979711,1.0,1.0


In [8]:
# One-hot encode the 'CourseCategory' column
dummies = pd.get_dummies(df.CourseCategory, drop_first=True, dtype=int)

In [9]:
# Display the first few rows of the one-hot encoded dataframe
dummies.head()

Unnamed: 0,Business,Health,Programming,Science
0,0,1,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,1
4,0,0,1,0


In [10]:
# Drop the 'UserID' column as it is not needed for modeling
df = df.drop(['UserID'], axis=1)

In [11]:
# Merge the original dataframe with the one-hot encoded dataframe
merged_df = pd.concat([df, dummies], axis=1)

In [12]:
# Import the StandardScaler for feature scaling
from sklearn.preprocessing import StandardScaler
# Feature scaling (for SVM)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(merged_df.drop(['Business', 'Health', 'Programming', 'Science', 'DeviceType', 'CourseCompletion', 'CourseCategory'], axis=1))

In [13]:
# Import machine learning models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
# Evaluate multiple models
from sklearn.metrics import accuracy_score, classification_report

In [14]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X = merged_df.drop(['CourseCategory', 'CourseCompletion'], axis=1)
y = df.CourseCompletion
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [15]:
# Define your models manually with selected hyperparameters or default ones
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2),
    'DecisionTree': DecisionTreeClassifier(max_depth=None, min_samples_split=2),
    'SVM': SVC(C=1.0, gamma='scale', kernel='rbf')
}

# Train and evaluate each model separately
for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Calculate accuracy score
    test_score = accuracy_score(y_test, y_pred)
    print(f"Test set accuracy score for {model_name}: {test_score:.4f}")
    
    # Print classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print()

Evaluating RandomForest...
Test set accuracy score for RandomForest: 0.9631
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1356
           1       0.97      0.94      0.95       894

    accuracy                           0.96      2250
   macro avg       0.96      0.96      0.96      2250
weighted avg       0.96      0.96      0.96      2250


Evaluating DecisionTree...
Test set accuracy score for DecisionTree: 0.9227
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      1356
           1       0.91      0.90      0.90       894

    accuracy                           0.92      2250
   macro avg       0.92      0.92      0.92      2250
weighted avg       0.92      0.92      0.92      2250


Evaluating SVM...
Test set accuracy score for SVM: 0.8338
Classification Report:
              precision    recall  f1-score   support

          

**Conclusion:**

After evaluating multiple machine learning models on the online course engagement dataset, we obtained the following results:

- **RandomForest:** Achieved the highest accuracy score of 96.49%. It demonstrated excellent performance with high precision and recall, making it the most reliable model among the three evaluated.
- **DecisionTree:** Showed a good accuracy score of 92.31%. While it performed well, its accuracy and f1-scores were slightly lower compared to the RandomForest model.
- **SVM:** Had the lowest accuracy score of 82.80%. Although it had reasonable performance, it was outperformed by both the RandomForest and DecisionTree models.

Based on these results, the **RandomForest** model is recommended for predicting course completion in the given dataset due to its superior performance metrics.