# Extração de caracteristicas - studentVle

Esse notebook tem como objetivo realizar extração de caracteristicas da tabela *studentVle* que é uma das mais importantes para entender o engajamento do aluno dentro do curso.

In [4]:
!pip install -qqq pandas_profiling

import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from missingno import matrix

mpl.rcParams['figure.dpi'] = 200
sns.set_style("darkgrid")

In [5]:
arquivos = glob.glob("../../data/raw/*.csv")
arquivos

['../../data/raw/studentAssessment.csv',
 '../../data/raw/studentInfo.csv',
 '../../data/raw/studentVle.csv',
 '../../data/raw/courses.csv',
 '../../data/raw/vle.csv',
 '../../data/raw/studentRegistration.csv',
 '../../data/raw/assessments.csv']

In [7]:
data = {
    a.split('/')[-1][:-4]: pd.read_csv(a)
    for a in arquivos
}
data.keys()

dict_keys(['studentAssessment', 'studentInfo', 'studentVle', 'courses', 'vle', 'studentRegistration', 'assessments'])

## Modelo dos dados

![](../../images/model.png)

### Tabela *studentVle*

In [4]:
data['studentVle']

Unnamed: 0,code_module,code_presentation,id_student,id_site,date,sum_click
0,AAA,2013J,28400,546652,-10,4
1,AAA,2013J,28400,546652,-10,1
2,AAA,2013J,28400,546652,-10,1
3,AAA,2013J,28400,546614,-10,11
4,AAA,2013J,28400,546714,-10,1
...,...,...,...,...,...,...
10655275,GGG,2014J,675811,896943,269,3
10655276,GGG,2014J,675578,896943,269,1
10655277,GGG,2014J,654064,896943,269,3
10655278,GGG,2014J,654064,896939,269,1


In [13]:
studentVle = data['studentVle'].set_index("id_site").join(
    data['vle'].drop(columns=['code_module', 'code_presentation']).set_index("id_site"), 
)
studentVle

Unnamed: 0_level_0,code_module,code_presentation,id_student,date,sum_click,activity_type,week_from,week_to
id_site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
526721,FFF,2013B,247666,-18,11,homepage,,
526721,FFF,2013B,244114,-18,5,homepage,,
526721,FFF,2013B,243657,-18,16,homepage,,
526721,FFF,2013B,243600,-18,31,homepage,,
526721,FFF,2013B,244816,-18,11,homepage,,
...,...,...,...,...,...,...,...,...
1046866,CCC,2014J,587391,258,1,resource,,
1046866,CCC,2014J,2598961,264,1,resource,,
1046866,CCC,2014J,621605,266,1,resource,,
1049562,CCC,2014J,338731,238,1,oucontent,,


In [18]:
studentVle.activity_type.unique()

array(['homepage', 'forumng', 'glossary', 'oucontent', 'quiz',
       'questionnaire', 'page', 'ouwiki', 'subpage', 'url', 'resource',
       'dualpane', 'ouelluminate', 'dataplus', 'sharedsubpage',
       'oucollaborate', 'externalquiz', 'folder', 'repeatactivity',
       'htmlactivity'], dtype=object)

In [19]:
types = ['homepage', 'forumng', 'glossary', 'oucontent', 'quiz',
       'questionnaire', 'page', 'ouwiki', 'subpage', 'url', 'resource',
       'dualpane', 'ouelluminate', 'dataplus', 'sharedsubpage',
       'oucollaborate', 'externalquiz', 'folder', 'repeatactivity',
       'htmlactivity']

def extract_features(df):

    features = {}
    
    for act_type in types:
        total = df.query(f"activity_type == '{act_type}'")
        preCourse = total.query("date < 0")

        features[f"{act_type}Clicks"] = len(total),
        features[f"{act_type}Interactions"] = total.sum_click.sum()
        features[f"{act_type}PreCourseInteractions"] = len(preCourse)
        features[f"{act_type}PreCourseInteractionsClicks"] = preCourse.sum_click.sum()
    
    return pd.Series(features)

features = studentVle.groupby(["code_module", "code_presentation", "id_student"]).apply(extract_features)
features

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,homepageClicks,homepageInteractions,homepagePreCourseInteractions,homepagePreCourseInteractionsClicks,forumngClicks,forumngInteractions,forumngPreCourseInteractions,forumngPreCourseInteractionsClicks,glossaryClicks,glossaryInteractions,...,folderPreCourseInteractions,folderPreCourseInteractionsClicks,repeatactivityClicks,repeatactivityInteractions,repeatactivityPreCourseInteractions,repeatactivityPreCourseInteractionsClicks,htmlactivityClicks,htmlactivityInteractions,htmlactivityPreCourseInteractions,htmlactivityPreCourseInteractionsClicks
code_module,code_presentation,id_student,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
AAA,2013J,11391,"(40,)",138,1,7,"(52,)",193,2,2,"(0,)",0,...,0,0,"(0,)",0,0,0,"(0,)",0,0,0
AAA,2013J,28400,"(80,)",324,7,46,"(163,)",417,18,73,"(0,)",0,...,0,0,"(0,)",0,0,0,"(0,)",0,0,0
AAA,2013J,30268,"(12,)",59,6,32,"(30,)",126,10,39,"(0,)",0,...,0,0,"(0,)",0,0,0,"(0,)",0,0,0
AAA,2013J,31604,"(121,)",432,5,30,"(197,)",634,8,29,"(1,)",1,...,0,0,"(0,)",0,0,0,"(0,)",0,0,0
AAA,2013J,32885,"(68,)",204,8,52,"(77,)",194,15,73,"(3,)",4,...,0,0,"(0,)",0,0,0,"(0,)",0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GGG,2014J,2640965,"(5,)",22,1,2,"(0,)",0,0,0,"(0,)",0,...,0,0,"(0,)",0,0,0,"(0,)",0,0,0
GGG,2014J,2645731,"(35,)",167,0,0,"(14,)",65,0,0,"(2,)",5,...,0,0,"(0,)",0,0,0,"(0,)",0,0,0
GGG,2014J,2648187,"(22,)",63,1,6,"(0,)",0,0,0,"(1,)",1,...,0,0,"(0,)",0,0,0,"(0,)",0,0,0
GGG,2014J,2679821,"(13,)",65,2,17,"(17,)",118,5,31,"(0,)",0,...,0,0,"(0,)",0,0,0,"(0,)",0,0,0


In [20]:
# features.to_csv("../../data/interim/studentVleFeatures.csv")