# Extração de caracteristicas - studentInfo

Esse notebook tem como objetivo realizar uma analise exploratória da tabela *studentInfo* que é uma das mais importantes para entender o engajamento do aluno dentro do curso.

In [7]:
!pip install -qqq pandas_profiling

import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from missingno import matrix

mpl.rcParams['figure.dpi'] = 200
sns.set_style("darkgrid")

In [8]:
arquivos = glob.glob("../../data/raw/*.csv")
arquivos

['../../data/raw/studentAssessment.csv',
 '../../data/raw/studentInfo.csv',
 '../../data/raw/studentVle.csv',
 '../../data/raw/courses.csv',
 '../../data/raw/vle.csv',
 '../../data/raw/studentRegistration.csv',
 '../../data/raw/assessments.csv']

In [9]:
data = {
    a.split('/')[-1][:-4]: pd.read_csv(a)
    for a in arquivos
}
data.keys()

dict_keys(['studentAssessment', 'studentInfo', 'studentVle', 'courses', 'vle', 'studentRegistration', 'assessments'])

## Modelo dos dados

![](../../images/model.png)

### Tabela *studentInfo*

In [6]:
data['studentInfo']

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass
...,...,...,...,...,...,...,...,...,...,...,...,...
32588,GGG,2014J,2640965,F,Wales,Lower Than A Level,10-20,0-35,0,30,N,Fail
32589,GGG,2014J,2645731,F,East Anglian Region,Lower Than A Level,40-50%,35-55,0,30,N,Distinction
32590,GGG,2014J,2648187,F,South Region,A Level or Equivalent,20-30%,0-35,0,30,Y,Pass
32591,GGG,2014J,2679821,F,South East Region,Lower Than A Level,90-100%,35-55,0,30,N,Withdrawn


In [22]:
studentAssessmentFeatures

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,totalAssessmentsTaken,meanTMAScore,meanCMAScore,meanExamScore,submittedLate,totalBanked
code_module,code_presentation,id_student,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AAA,2014J,6516,5.0,61.800000,,,0.0,0.0
DDD,2013J,8462,3.0,87.666667,,,1.0,0.0
DDD,2014J,8462,4.0,86.500000,,,0.0,4.0
AAA,2013J,11391,5.0,82.000000,,,0.0,0.0
BBB,2013B,23629,4.0,65.000000,100.000000,,3.0,0.0
...,...,...,...,...,...,...,...,...
AAA,2013J,2698257,5.0,67.800000,,,2.0,0.0
CCC,2014B,2698535,4.0,23.000000,24.666667,,3.0,0.0
EEE,2013J,2698535,4.0,54.250000,,,1.0,0.0
BBB,2014J,2698577,5.0,64.400000,,,0.0,0.0


In [23]:
studentVleFeatures

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,homepageClicks,homepageInteractions,homepagePreCourseInteractions,homepagePreCourseInteractionsClicks,forumngClicks,forumngInteractions,forumngPreCourseInteractions,forumngPreCourseInteractionsClicks,glossaryClicks,glossaryInteractions,...,folderPreCourseInteractions,folderPreCourseInteractionsClicks,repeatactivityClicks,repeatactivityInteractions,repeatactivityPreCourseInteractions,repeatactivityPreCourseInteractionsClicks,htmlactivityClicks,htmlactivityInteractions,htmlactivityPreCourseInteractions,htmlactivityPreCourseInteractionsClicks
code_module,code_presentation,id_student,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
AAA,2013J,11391,-40,138,1,7,-52,193,2,2,0,0,...,0,0,0,0,0,0,0,0,0,0
AAA,2013J,28400,-80,324,7,46,-163,417,18,73,0,0,...,0,0,0,0,0,0,0,0,0,0
AAA,2013J,30268,-12,59,6,32,-30,126,10,39,0,0,...,0,0,0,0,0,0,0,0,0,0
AAA,2013J,31604,-121,432,5,30,-197,634,8,29,-1,1,...,0,0,0,0,0,0,0,0,0,0
AAA,2013J,32885,-68,204,8,52,-77,194,15,73,-3,4,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GGG,2014J,2640965,-5,22,1,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GGG,2014J,2645731,-35,167,0,0,-14,65,0,0,-2,5,...,0,0,0,0,0,0,0,0,0,0
GGG,2014J,2648187,-22,63,1,6,0,0,0,0,-1,1,...,0,0,0,0,0,0,0,0,0,0
GGG,2014J,2679821,-13,65,2,17,-17,118,5,31,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
keys = ["code_module", "code_presentation", "id_student"]

studentAssessmentFeatures = pd.read_csv('../../data/interim/studentAssessmentFeatures.csv').set_index(keys)
studentVleFeatures = pd.read_csv('../../data/interim/studentVleFeatures.csv', sep=';').set_index(keys)


studentModuleData = data['studentInfo']\
    .set_index(keys)\
    .join(studentAssessmentFeatures)\
    .join(studentVleFeatures)

studentModuleData

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,totalAssessmentsTaken,...,folderPreCourseInteractions,folderPreCourseInteractionsClicks,repeatactivityClicks,repeatactivityInteractions,repeatactivityPreCourseInteractions,repeatactivityPreCourseInteractionsClicks,htmlactivityClicks,htmlactivityInteractions,htmlactivityPreCourseInteractions,htmlactivityPreCourseInteractionsClicks
code_module,code_presentation,id_student,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GGG,2014J,2640965,F,Wales,Lower Than A Level,10-20,0-35,0,30,N,Fail,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GGG,2014J,2645731,F,East Anglian Region,Lower Than A Level,40-50%,35-55,0,30,N,Distinction,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GGG,2014J,2648187,F,South Region,A Level or Equivalent,20-30%,0-35,0,30,Y,Pass,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GGG,2014J,2679821,F,South East Region,Lower Than A Level,90-100%,35-55,0,30,N,Withdrawn,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
studentModuleData.to_csv('../../data/interim/studentModuleData.csv')

In [None]:
from pandas_profiling import ProfileReport

ProfileReport(studentModuleData)

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=112.0, style=ProgressStyle(descri…

In [None]:
types = ['homepage', 'forumng', 'glossary', 'oucontent', 'quiz',
       'questionnaire', 'page', 'ouwiki', 'subpage', 'url', 'resource',
       'dualpane', 'ouelluminate', 'dataplus', 'sharedsubpage',
       'oucollaborate', 'externalquiz', 'folder', 'repeatactivity',
       'htmlactivity']

def extract_features(df):
    
    'gender', 'region', 'highest_education', 'imd_band', 'age_band', 'num_of_prev_attempts', 'studied_credits', 'disability', 'final_result'
    features = {}
    
    for act_type in types:
        total = df.query(f"activity_type == '{act_type}'")
        preCourse = total.query("date < 0")

        features[f"{act_type}Clicks"] = len(total),
        features[f"{act_type}Interactions"] = total.sum_click.sum()
        features[f"{act_type}PreCourseInteractions"] = len(preCourse)
        features[f"{act_type}PreCourseInteractionsClicks"] = preCourse.sum_click.sum()
    
    return pd.Series({
        
    })

features = studentModuleData.groupby("id_student").apply(extract_features)
features

In [None]:
features.to_csv('../../data/interim/studentModuleDataFeatures.csv')