## Smart E-Learning Framework using Deep Q-Learning

### Importing the Necessary Libraries

In [25]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import tensorflow as tf

from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder

### Importing the Dataset

In [2]:
# Read the training file first and the test file second; in both, the 'time'
# column is parsed into datetimes while loading.
training_set, test_set = (
    pd.read_csv(csv_path, parse_dates=['time'])
    for csv_path in ('./Dataset/Training_Data.csv', './Dataset/Test_Data.csv')
)

# Preview the first rows of each frame as a quick check on the load.
print("Training Set Values: \n", training_set.head())
print("Test Set Values: \n", test_set.head())

Training Set Values: 
    enroll_id  username                              course_id  \
0       1103    550424  course-v1:TsinghuaX+70800232X+2015_T2   
1       1103    550424  course-v1:TsinghuaX+70800232X+2015_T2   
2       1735   1003557  course-v1:TsinghuaX+70800232X+2015_T2   
3       1103    550424  course-v1:TsinghuaX+70800232X+2015_T2   
4        895    119795  course-v1:TsinghuaX+70800232X+2015_T2   

                         session_id      action  \
0  eb08a96b85f9ef51e8ab624447ecc33b  stop_video   
1  eb08a96b85f9ef51e8ab624447ecc33b  stop_video   
2  8fe0f1348594853d460796f498fb7f0e  play_video   
3  eb08a96b85f9ef51e8ab624447ecc33b  stop_video   
4  18f52eaeb8bbddbefe9e58a6efa9ca95  seek_video   

                             object                time  truth  
0  3169d758ee2d4262b07f0113df743c42 2015-10-27 12:24:59      0  
1  3169d758ee2d4262b07f0113df743c42 2015-10-27 13:57:45      0  
2  e8621bd03c2446dcb89548a373177354 2015-10-19 19:09:34      1  
3  3169d758ee2d4262

### Data Preprocessing

#### Finding and removing the features which contain null values

In [3]:
# Handle the training frame first and the test frame second. For each one:
# report the per-column null counts, drop (in place) every column that holds
# at least one null, then preview what is left.
# NOTE(review): each frame is filtered on its own nulls, so the two frames end
# up with the same columns only if their nulls sit in the same columns - confirm.
for frame in (training_set, test_set):
    print(frame.isnull().sum())

    frame.dropna(axis=1, inplace=True)
    print(frame.head())

enroll_id         0
username          0
course_id         0
session_id        0
action            0
object        13803
time              0
truth             0
dtype: int64
   enroll_id  username                              course_id  \
0       1103    550424  course-v1:TsinghuaX+70800232X+2015_T2   
1       1103    550424  course-v1:TsinghuaX+70800232X+2015_T2   
2       1735   1003557  course-v1:TsinghuaX+70800232X+2015_T2   
3       1103    550424  course-v1:TsinghuaX+70800232X+2015_T2   
4        895    119795  course-v1:TsinghuaX+70800232X+2015_T2   

                         session_id      action                time  truth  
0  eb08a96b85f9ef51e8ab624447ecc33b  stop_video 2015-10-27 12:24:59      0  
1  eb08a96b85f9ef51e8ab624447ecc33b  stop_video 2015-10-27 13:57:45      0  
2  8fe0f1348594853d460796f498fb7f0e  play_video 2015-10-19 19:09:34      1  
3  eb08a96b85f9ef51e8ab624447ecc33b  stop_video 2015-10-27 19:15:38      0  
4  18f52eaeb8bbddbefe9e58a6efa9ca95  seek_video 201

#### Encoding the Numerical Data of the Training Set

In [4]:
# Pick out the integer-typed columns of the training frame.
numerical_training_set = training_set.select_dtypes(include=['int'])

# Learn each column's min/max, then rescale the columns into the [0, 1] range.
# From here on `numerical_training_set` is the scaled array, not a DataFrame.
numerical_encoder = MinMaxScaler(feature_range=(0,1)).fit(numerical_training_set)
numerical_training_set = numerical_encoder.transform(numerical_training_set)

# Ask the fitted scaler for its column names once; they are shown and then
# reused as the headers of the scaled frame.
scaled_column_names = numerical_encoder.get_feature_names_out()
print(scaled_column_names)

numerical_training_dataframe = pd.DataFrame(numerical_training_set, columns=scaled_column_names)

print(numerical_training_set)
print(numerical_training_dataframe.head())

['enroll_id' 'username' 'truth']
[[0.31285444 0.33480581 0.        ]
 [0.31285444 0.33480581 0.        ]
 [0.91020794 0.6104811  1.        ]
 ...
 [0.70510397 0.6623111  0.        ]
 [0.31285444 0.33480581 0.        ]
 [0.31285444 0.33480581 0.        ]]
   enroll_id  username  truth
0   0.312854  0.334806    0.0
1   0.312854  0.334806    0.0
2   0.910208  0.610481    1.0
3   0.312854  0.334806    0.0
4   0.116257  0.072821    1.0


#### Encoding the Categorical Data of the Training Set

In [5]:
# Collect the object-dtype columns of the training frame.
categorical_training_set = training_set.select_dtypes(include=['object'])

# One-hot encode 'action' into one indicator column per action value; the
# other object columns are passed through unchanged.
categorical_training_set = pd.get_dummies(categorical_training_set, columns=['action'])

# Replace the values of every column with ordinal category codes.
encoder = OrdinalEncoder()
categorical_encoded_set = encoder.fit_transform(categorical_training_set)

# Scale the codes into [0, 1] with a scaler dedicated to this step. Refitting
# the shared `numerical_encoder` here would throw away the min/max it learned
# from the numeric columns, leaving it out of sync with the numeric frame it
# produced. The scaled values themselves are the same either way.
categorical_scaler = MinMaxScaler(feature_range=(0, 1))
categorical_encoded_set = categorical_scaler.fit_transform(categorical_encoded_set)

# Rebuild a DataFrame, restoring the column names from the one-hot step.
categorical_training_set = pd.DataFrame(categorical_encoded_set, columns=categorical_training_set.columns)
print(categorical_training_set.head())

   course_id  session_id  action_click_about  action_click_courseware  \
0        0.0    0.916424                 0.0                      0.0   
1        0.0    0.916424                 0.0                      0.0   
2        0.0    0.524781                 0.0                      0.0   
3        0.0    0.916424                 0.0                      0.0   
4        0.0    0.103013                 0.0                      0.0   

   action_click_forum  action_click_info  action_click_progress  \
0                 0.0                0.0                    0.0   
1                 0.0                0.0                    0.0   
2                 0.0                0.0                    0.0   
3                 0.0                0.0                    0.0   
4                 0.0                0.0                    0.0   

   action_close_courseware  action_create_comment  action_create_thread  \
0                      0.0                    0.0                   0.0   
1       

#### Encoding the DateTime feature

In [6]:
print(training_set['time'].head())

# Convert each timestamp to whole seconds since the Unix epoch. Dividing the
# offset from the epoch by a one-second Timedelta yields the same integers as
# casting nanosecond values to int64 and dividing by 10 ** 9, but it does not
# depend on the column being stored at nanosecond resolution.
# Assumes timezone-naive timestamps, as in the preview printed above.
unix_epoch = pd.Timestamp("1970-01-01")
training_set['datetime'] = (training_set['time'] - unix_epoch) // pd.Timedelta(seconds=1)
training_set = training_set.drop(['time'], axis = 1)

# Scale the epoch seconds into [0, 1].
# NOTE(review): this refits the shared `numerical_encoder`, replacing whatever
# it was fitted on before this cell ran.
training_set['datetime'] = numerical_encoder.fit_transform(pd.DataFrame(training_set['datetime']))
print(training_set['datetime'])

0   2015-10-27 12:24:59
1   2015-10-27 13:57:45
2   2015-10-19 19:09:34
3   2015-10-27 19:15:38
4   2015-09-28 15:31:43
Name: time, dtype: datetime64[ns]
0        0.901615
1        0.903419
2        0.685364
3        0.909604
4        0.092806
           ...   
49995    0.289124
49996    0.928223
49997    0.868127
49998    0.903983
49999    0.912940
Name: datetime, Length: 50000, dtype: float64


#### Combining the encoded features into a single dataframe

In [10]:
# Join the three encoded pieces column-wise, matching rows on their index labels:
# scaled numeric columns, encoded categorical columns, then the scaled datetime.
encoded_training_set = pd.merge(numerical_training_dataframe, categorical_training_set, left_index=True, right_index=True)
encoded_training_set = pd.merge(encoded_training_set, training_set['datetime'], left_index=True, right_index=True)

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line after the summary.
encoded_training_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   enroll_id                50000 non-null  float64
 1   username                 50000 non-null  float64
 2   truth                    50000 non-null  float64
 3   course_id                50000 non-null  float64
 4   session_id               50000 non-null  float64
 5   action_click_about       50000 non-null  float64
 6   action_click_courseware  50000 non-null  float64
 7   action_click_forum       50000 non-null  float64
 8   action_click_info        50000 non-null  float64
 9   action_click_progress    50000 non-null  float64
 10  action_close_courseware  50000 non-null  float64
 11  action_create_comment    50000 non-null  float64
 12  action_create_thread     50000 non-null  float64
 13  action_load_video        50000 non-null  float64
 14  action_pause_video    

#### Splitting the Training Set into two parts

In [8]:
from sklearn.model_selection import train_test_split

# Shuffle the encoded rows with a fixed seed and cut them into two equal halves.
# Despite their names, `xtrain` and `ytrain` are both row subsets of the same
# frame (every column is present in each); this is not a feature/label split.
split_halves = train_test_split(encoded_training_set, random_state=42, train_size=0.5)
xtrain, ytrain = split_halves

# Preview the first rows of each half.
for half_label, half_frame in (
    ("Training Set - X Train: \n", xtrain),
    ("Training Set - Y Train: \n", ytrain),
):
    print(half_label, half_frame.head())

Training Set - X Train: 
        enroll_id  username  truth  course_id  session_id  action_click_about  \
25858   0.312854  0.334806    0.0        0.0    0.916424                 0.0   
10784   0.696597  0.653865    0.0        0.0    0.587949                 1.0   
24807   0.988658  0.214788    1.0        0.0    0.150632                 0.0   
49534   0.727788  0.897883    1.0        0.0    0.045675                 0.0   
3345    0.493384  0.307187    0.0        0.0    0.375121                 0.0   

       action_click_courseware  action_click_forum  action_click_info  \
25858                      0.0                 0.0                0.0   
10784                      0.0                 0.0                0.0   
24807                      0.0                 0.0                1.0   
49534                      0.0                 0.0                0.0   
3345                       0.0                 0.0                0.0   

       action_click_progress  action_close_courseware 

### Feature Selection

#### Identifying the Categorical Features and Numerical Features in the Dataframe

In [12]:
# Column labels of the training frame grouped by storage type:
# integer/float columns versus object-dtype columns.
numerical_features = training_set.select_dtypes(include=['int', 'float']).columns
categorical_features = training_set.select_dtypes(include=['object']).columns

# Show each group as a plain Python list, numeric group first.
print(numerical_features.tolist())
print(categorical_features.tolist())

['enroll_id', 'username', 'truth', 'datetime']
['course_id', 'session_id', 'action']
