In [None]:
import json
import pandas as pd
import Levenshtein as lv
import numpy as np
import os
import pickle
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn_pandas import DataFrameMapper
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report, precision_recall_fscore_support, confusion_matrix
from sklearn.model_selection import cross_val_score

In [None]:
def measure_time_distance(timedelta):
    """
    Express a time difference as a number of seconds.

    timedelta: object exposing ``total_seconds()`` (for example a
    ``datetime.timedelta`` or a ``pandas.Timedelta``)

    return the value given by ``timedelta.total_seconds()``
    """
    return timedelta.total_seconds()

def measure_text_distance(x,y):
    """
    Edit distance between the source code of two submissions.

    x, y: the two texts to compare

    return the Levenshtein distance computed by ``lv.distance``
    (https://en.wikipedia.org/wiki/Levenshtein_distance)
    """
    edit_distance = lv.distance(x, y)
    return edit_distance

def calculate_distribution(df, column_name, exercise, mean=True):
    """
    Distribution of the values found in ``column_name`` (the submission status).

    df: Dataframe with submissions
    column_name: column with status
    exercise: label stored (as a string) under the 'exercise' key of the result
    mean: when True each count is divided by the number of rows of ``df``;
          when False the raw counts are kept

    return a Series indexed by status value, extended with the entries
    'exercise' and 'submission_amount' (the number of rows of ``df``)
    """
    n_rows = df.shape[0]
    per_status = df.groupby([column_name]).size()
    metrics = per_status / n_rows if mean else per_status
    # The two extra entries are appended to the same Series as the counts.
    metrics['exercise'] = str(exercise)
    metrics['submission_amount'] = n_rows
    return metrics

def shift_columns(df, shift_exercise=False):
    """
    Add "previous row" columns and row-to-row distances to ``df``.

    Every ``*.previous`` / ``*shifted`` column holds the value of the row
    immediately above (plain ``shift()``, not grouped by student); the first
    row receives '' (or the timestamp 1800-01-01 for ``datetime_shifted``).

    Columns always added:
      shifted, guide.name.previous, datetime_shifted, student.email.previous,
      distance  (Levenshtein distance between submissions_content and shifted),
      time_dist (seconds between datetime and datetime_shifted)
    Columns added when ``shift_exercise`` is True:
      exercise.eid.previous, exercise.name.previous,
      submissions_status.previous, submissions_count.previous,
      switch_exercise (exercise.name differs from exercise.name.previous)

    The columns are written into ``df`` itself, which is also returned.
    """
    # (new column, source column, value used for the first row)
    base_shifts = (
        ('shifted', 'submissions_content', ''),
        ('guide.name.previous', 'guide.name', ''),
        ('datetime_shifted', 'datetime', pd.Timestamp(1800, 1, 1, 0)),
        ('student.email.previous', 'student.email', ''),
    )
    for new_col, source_col, filler in base_shifts:
        df.loc[:, new_col] = df[source_col].shift().fillna(value=filler)

    df['distance'] = df.apply(
        lambda row: measure_text_distance(row['submissions_content'], row['shifted']),
        axis=1)
    df['time_dist'] = df.apply(
        lambda row: measure_time_distance(row['datetime'] - row['datetime_shifted']),
        axis=1)

    if not shift_exercise:
        return df

    exercise_shifts = (
        ('exercise.eid.previous', 'exercise.eid'),
        ('exercise.name.previous', 'exercise.name'),
        ('submissions_status.previous', 'submissions_status'),
        ('submissions_count.previous', 'submissions_submissions_count'),
    )
    for new_col, source_col in exercise_shifts:
        df.loc[:, new_col] = df[source_col].shift().fillna(value='')
    df['switch_exercise'] = df.apply(
        lambda row: row['exercise.name'] != row['exercise.name.previous'], axis=1)
    return df


In [None]:
dataset_home = "datasets/"
files = [
    #'2016-2c-introalgo-2c-2016.json', '2017-2c-introalgo-2c-2017.json',
    '2018-2c-introalgo-2018-2c-23Octubre.json',
]

# Exercises and accounts that are removed from the analysis.
# NOTE(review): personal e-mail addresses are hardcoded in this cell; consider
# moving them to a private configuration file before sharing the notebook.
excluded_exercises = ['cifrasBinarias', 'ciclar', 'cifrasBase']
excluded_students = [
    'walteralini@gmail.com',
    'romina.altamirano@gmail.com',
    'kouichicruz@gmail.com',
]
meta_fields = [['guide', 'slug'], ['student', 'email'], ['guide', 'name'],
               ['exercise', 'name'], ['exercise', 'eid'], ['guide', 'language', 'name']]

# json_normalize is exposed as pd.json_normalize since pandas 1.0; fall back to
# the old location when running on an older pandas.
normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize

# One filtered frame per input file. Previously `submissions` was overwritten on
# every iteration, so only the last file of `files` survived the loop.
frames = []
for file_name in files:
    # `with` guarantees the file handle is closed once the JSON is parsed.
    with open(dataset_home + str(file_name), encoding='utf8') as json_file:
        json_data = json.load(json_file)

    loaded = normalize(json_data, 'submissions', meta_fields,
                       record_prefix='submissions_', errors='ignore')
    loaded['datetime'] = pd.to_datetime(loaded['submissions_created_at'])

    keep = (
        loaded['submissions_content'].notnull()
        & (loaded['submissions_status'] != 'aborted')
        #& (loaded['submissions_status'] != 'passed')
        #& (loaded['submissions_status'] != 'passed_with_warnings')
        & (loaded['guide.language.name'] == 'haskell')
        & ~loaded['exercise.name'].isin(excluded_exercises)
        & ~loaded['student.email'].isin(excluded_students)
        # This account is dropped only for submissions whose creation date contains '2017'.
        & ~((loaded['student.email'] == 'ismaelpeker@gmail.com')
            & loaded['submissions_created_at'].str.contains('2017', na=False))
    )
    frames.append(loaded[keep])

# A fresh 0..n-1 index keeps the row labels unique across files.
submissions = pd.concat(frames, ignore_index=True)

In [None]:
# Number of submissions per status over the whole filtered dataset
# (mean=False keeps raw counts; "todos" is stored as the 'exercise' label).
calculate_distribution(submissions, "submissions_status", "todos", False)

In [None]:
# Total number of (student, exercise) pairs: for every student, count the
# distinct exercises with at least one submission, then add the counts up.
exercises_attempted = int(
    submissions.groupby('student.email')['exercise.name'].nunique().sum()
)
exercises_attempted

In [None]:
# Order the rows by student and then chronologically, so that the "previous row"
# columns added by shift_columns refer to the preceding submission in that order
# (shift_exercise=True also adds the exercise/status columns and switch_exercise).
submissions = submissions.sort_values(['student.email', 'datetime'])
submissions = shift_columns(submissions, True)

## Abandono por cambio de ejercicio

In [None]:
# Rows where the student moved to a different exercise right after a failed or
# errored submission (previous row must belong to the same student).
previous_failed = submissions['submissions_status.previous'].isin(['failed', 'errored'])
same_student = submissions['student.email'] == submissions['student.email.previous']
switch_view_columns = [
    'exercise.name.previous', 'submissions_status.previous', 'time_dist',
    'switch_exercise', 'submissions_status', 'student.email.previous',
    'student.email', 'exercise.name', 'datetime', 'datetime_shifted',
]
# NOTE(review): .iloc[1:] hides the first row of the *filtered* result — confirm
# that skipping it is intended.
submissions[submissions['switch_exercise'] & previous_failed & same_student][
    switch_view_columns].iloc[1:]

## Abandono por cambio de Sesion

In [None]:
# Maximum gap, in the units of `time_dist` (seconds), for two consecutive
# submissions to be compared against when detecting a session change.
threshold = 454

# Rows where the student stayed on the same exercise, the previous submission
# failed or errored, and more than `threshold` seconds passed since it.
stayed_on_exercise = submissions['switch_exercise'].eq(False)
previous_failed = submissions['submissions_status.previous'].isin(['failed', 'errored'])
long_pause = submissions['time_dist'] > threshold
same_student = submissions['student.email'] == submissions['student.email.previous']
session_view_columns = [
    'exercise.name.previous', 'submissions_status.previous', 'time_dist',
    'switch_exercise', 'submissions_status', 'student.email.previous',
    'student.email', 'exercise.name', 'datetime',
]
# NOTE(review): .iloc[1:] hides the first row of the *filtered* result — confirm
# that skipping it is intended.
submissions[stayed_on_exercise & previous_failed & long_pause & same_student][
    session_view_columns].iloc[1:]

## Anoto el dataset
- En la columna `dropout_switch` se anotan con True los abandonos por cambio de ejercicio.
- En la columna `dropout_session` se anotan con True los abandonos por cambio de sesión.

In [None]:
# dropout_switch: the row starts a different exercise, the previous submission
# failed or errored, and the previous row belongs to the same student.
submissions['dropout_switch'] = (
    submissions['switch_exercise'].astype(bool)
    & submissions['submissions_status.previous'].isin(['failed', 'errored'])
    & (submissions['student.email'] == submissions['student.email.previous'])
)

In [None]:
# dropout_session: same exercise as the previous row, the previous submission
# failed or errored, the previous row belongs to the same student and more than
# `threshold` seconds passed between both submissions.
# `threshold` is the value defined in the "Abandono por cambio de Sesion" cell;
# reusing it keeps the annotation and that view in agreement instead of
# repeating the literal 454 here.
submissions['dropout_session'] = (
    ~submissions['switch_exercise'].astype(bool)
    & submissions['submissions_status.previous'].isin(['failed', 'errored'])
    & (submissions['time_dist'] > threshold)
    & (submissions['student.email'] == submissions['student.email.previous'])
)

In [None]:
# `df` is bound to the same object as `submissions` (no copy is made), so
# columns added to `df` before it is rebound also appear in `submissions`.
df = submissions
#df = df[df['student.email'] == df['student.email.previous']]

# Anotación del dataset: 2da forma (trayectoria)

In [None]:
# Second annotation: instead of flagging only the row where the dropout is
# detected, flag the run of rows (the "trajectory") that leads to it.
df['dropout_session_tray'] = False
df['dropout_switch_tray'] = False
# Newest rows first, so each (student, exercise) group is walked from the
# detected dropout towards the rows that sort after it in this order.
df = df.sort_values(['datetime_shifted', 'student.email.previous'], ascending=False)

is_dropout = df['dropout_switch'] | df['dropout_session']
annotated_groups = []  # one frame per visited (student, exercise) group
for student in df['student.email.previous'].unique():
    student_mask = df['student.email.previous'] == student
    dropped_exercises = df[student_mask & is_dropout]['exercise.name.previous'].unique()
    for exercise_name in dropped_exercises:
        exer_mask = student_mask & (df['exercise.name.previous'] == exercise_name)
        flag_session = False
        flag_switch = False
        # Only the *_tray columns are written inside the loop, so iterating over
        # this snapshot sees the same dropout flags as re-filtering `df` would.
        for index, row in df[exer_mask].iterrows():
            # A session trajectory starts on a dropout_session row and continues
            # until a row flagged as dropout_switch is reached.
            flag_session = bool((row['dropout_session'] or flag_session)
                                and not row['dropout_switch'])
            if flag_session:
                df.at[index, 'dropout_session_tray'] = True

            # Symmetric rule for switch trajectories.
            flag_switch = bool((row['dropout_switch'] or flag_switch)
                               and not row['dropout_session'])
            if flag_switch:
                df.at[index, 'dropout_switch_tray'] = True
        # Taken after the annotation so the debug frame shows the updated flags
        # (the pre-annotation snapshot always had both *_tray columns False).
        annotated_groups.append(df[exer_mask])

# Debug view of every annotated group, built with a single concat instead of
# growing a DataFrame inside the loop.
test = pd.concat(annotated_groups, ignore_index=True) if annotated_groups else pd.DataFrame()

In [None]:
# Summary of both annotation schemes: number of flagged rows and their share of
# all submissions.
cant_soluciones = df.shape[0]
cant_switch = int(df['dropout_switch'].sum())
cant_session = int(df['dropout_session'].sum())

cant_switch_tray = int(df['dropout_switch_tray'].sum())
cant_session_tray = int(df['dropout_session_tray'].sum())

total_first = cant_session + cant_switch
total_tray = cant_session_tray + cant_switch_tray

print("Cantidad submissions {}".format(cant_soluciones))

print("1era forma de anotar (solo la ultima submissions)")
print("Cantidad de dropout session {}. Proporcion respecto total submissions {:.3f}".format(
    cant_session, cant_session/cant_soluciones))
print("Cantidad de dropout switchs {}. Proporcion respecto total submissions {:.3f} ".format(
    cant_switch, cant_switch/cant_soluciones))
print("Considerando ambos tipos como el mismo cantidad {}  proporcion {:.3f}".format(
    total_first, total_first/cant_soluciones))

print("\n2da forma de anotar (anoto trayectoria)")
print('Cantidad de dropout session {}. Proporcion respecto total submissions {:.3f}'.format(
    cant_session_tray, cant_session_tray/cant_soluciones))
print('Cantidad de dropout switch {}. Proporcion respecto total submissions {:.3f}'.format(
    cant_switch_tray, cant_switch_tray/cant_soluciones))
print("Considerando ambos tipos como el mismo cantidad {}  proporcion {:.3f}".format(
    total_tray, total_tray/cant_soluciones))

In [None]:
# List the columns available after both annotations.
df.columns

In [None]:
# Keep only the "previous row" view of each submission plus the dropout labels,
# and rename those columns back to their plain names.
annotated_columns = [
    'shifted', 'guide.name.previous', 'datetime_shifted', 'student.email.previous',
    'distance', 'time_dist', 'exercise.eid.previous', 'exercise.name.previous',
    'submissions_status.previous', 'submissions_count.previous', 'switch_exercise',
    'dropout_switch', 'dropout_session', 'dropout_session_tray', 'dropout_switch_tray',
]
plain_names = {
    'shifted': 'content',
    'guide.name.previous': 'guide.name',
    'datetime_shifted': 'datetime',
    'student.email.previous': 'student.email',
    'exercise.eid.previous': 'exercise.eid',
    'exercise.name.previous': 'exercise.name',
    'submissions_status.previous': 'submissions_status',
    'submissions_count.previous': 'submissions_count',
}
submissions_df = df[annotated_columns].rename(columns=plain_names)
# NOTE(review): the loading cells further down read
# 'dataframes_pkl/introAlgo_FINAL_anotado.pkl', a different path from the one
# written here — confirm which file is the intended hand-off.
submissions_df.to_pickle('introAlgo_final_anotado.pkl')

# Dropout Histogram 

In [None]:
# Per student, the number of rows flagged as part of a switch trajectory
# (the non-null 'content' count of the dropout_switch_tray == True group),
# sorted from most to fewest.
switch_counts = (
    submissions_df
    .groupby(['student.email', 'dropout_switch_tray'])
    .count()
    .reset_index()
)
dropout_switch_df = (
    switch_counts[switch_counts['dropout_switch_tray']]
    .sort_values('content', ascending=False)[['student.email', 'content']]
    .rename(columns={'content': 'dropout_switch_tray'})
)
dropout_switch_df

In [None]:
# Per student, the number of rows flagged as part of a session trajectory
# (the non-null 'content' count of the dropout_session_tray == True group),
# sorted from most to fewest.
session_counts = (
    submissions_df
    .groupby(['student.email', 'dropout_session_tray'])
    .count()
    .reset_index()
)
dropout_session_df = (
    session_counts[session_counts['dropout_session_tray']]
    .sort_values('content', ascending=False)[['student.email', 'content']]
    .rename(columns={'content': 'dropout_session_tray'})
)
dropout_session_df

In [None]:
# Join both per-student counts on the e-mail (pd.merge's default join keeps only
# students present in both frames).
dropouts_df = pd.merge(dropout_switch_df,dropout_session_df, on='student.email')
# Descending bin edges 400, 380, ..., 0.
# NOTE(review): neither `dropouts_df` nor `indexs` is used by any later code
# visible in this notebook — the histogram itself seems to be missing.
indexs = np.arange(0,401,20)[::-1]
indexs
#lst_dropouts_switch = [dropouts_df[dropouts_df['']]]

# Split dataset

In [None]:
base = 'datasets/introalgo/'
def train_dev_test(df, proportion=(.8, .9), random_state=None):
    """
    Split ``df`` into train and dev sets, stratified by exercise.

    For every distinct value of ``df['exercise.name']`` the rows of that
    exercise are shuffled and the first ``int(proportion[0] * n)`` of them go to
    train, the rest to dev.

    df: DataFrame with an 'exercise.name' column
    proportion: sequence whose first element is the train fraction. Any further
        element is accepted for backward compatibility but ignored, because no
        test split is produced.
    random_state: forwarded to ``DataFrame.sample``; pass a value to make the
        shuffle reproducible (default None keeps the previous unseeded behavior)

    return (train_df, dev_df), both with a fresh 0..n-1 index
    """
    train_fraction = proportion[0]
    train_parts = []
    dev_parts = []
    for exercise in df['exercise.name'].unique():
        shuffled = df[df['exercise.name'] == exercise].sample(
            frac=1, random_state=random_state)
        cut = int(train_fraction * len(shuffled))
        train_parts.append(shuffled.iloc[:cut])
        dev_parts.append(shuffled.iloc[cut:])

    if not train_parts:
        # No exercises at all: return empty frames that keep the input columns.
        return pd.DataFrame(columns=df.columns), pd.DataFrame(columns=df.columns)

    # Concatenate once instead of appending to a DataFrame inside the loop.
    train_df = pd.concat(train_parts, ignore_index=True)
    dev_df = pd.concat(dev_parts, ignore_index=True)
    return train_df, dev_df

### Load dataframe from pickle

In [None]:
# Replace `submissions` with the annotated dataset stored on disk.
# NOTE(review): the export cell above writes 'introAlgo_final_anotado.pkl'
# (no directory, different letter case); confirm this is the same file.
# Pickle files can execute code when loaded — only open trusted ones.
submissions = pd.read_pickle('dataframes_pkl/introAlgo_FINAL_anotado.pkl')
#submissions = submissions[~(submissions['student.email'] == 'ramiromariano.lerda@gmail.com')]

In [None]:
# Split only the submissions of the 'calcular' exercise into train and dev.
train, dev = train_dev_test(submissions[submissions['exercise.name'] == 'calcular'])
#train, dev, test = train_dev_test(submissions)
dev.shape

In [None]:
# Persist the splits under `base`.
# NOTE(review): train_dev_test returns only (train, dev); the only `test`
# assigned in the visible notebook is the debug frame built in the
# trajectory-annotation cell, so the third file does not hold a test split.
# Confirm what should be saved here.
train.to_pickle(base+'train801010.pkl')
dev.to_pickle(base+'dev801010.pkl')
test.to_pickle(base+'test801010.pkl')

## Load datasets from pickle

In [None]:
# Reload the three pickled splits written by the previous cell.
# Pickle files can execute code when loaded — only open trusted ones.
base = 'datasets/introalgo/'
train_df = pd.read_pickle(base+'train801010.pkl')
dev_df = pd.read_pickle(base+'dev801010.pkl')
test_df = pd.read_pickle(base+'test801010.pkl')

## Only for testing: load a fresh dataset and split it

In [None]:
# Start the feature computation from the annotated dataset stored on disk
# (rebinds `submissions`; switch the commented line to use the other dataset).
#submissions = pd.read_pickle('dataframes_pkl/mumuki_io_FINAL_anotado.pkl')
submissions = pd.read_pickle('dataframes_pkl/introAlgo_FINAL_anotado.pkl')

# Expertise Dimension
## PSA and PCA calculate

In [None]:
# PSA: exercises passed / submissions sent to those passed exercises.
# PCA: exercises attempted / total submissions.
# Both are per-student values copied onto every row of that student.
# The columns start as floats (0.0) because they receive ratios; writing floats
# into an integer column relies on silent upcasting.
submissions['PSA'] = 0.0
submissions['PCA'] = 0.0
students = submissions['student.email'].unique()
for student in tqdm(students, desc='students'):
    # Slice the student's rows once and reuse the slice for every count.
    student_rows = submissions['student.email'] == student
    student_df = submissions[student_rows]

    # exercises with at least one 'passed' submission
    exercises_passed = student_df[student_df['submissions_status'] == 'passed']['exercise.name'].unique()
    amount_exer_passed = len(exercises_passed)
    # submissions sent by the student
    amount_submissions = student_df.shape[0]
    # exercises attempted by the student
    amount_exercises_attempted = student_df['exercise.name'].nunique()
    # submissions sent to the exercises that were eventually passed
    cant_sol_acum = 0
    for exercise in exercises_passed:
        cant_sol_acum += int((student_df['exercise.name'] == exercise).sum())

    if cant_sol_acum > 0:
        submissions.loc[student_rows, 'PSA'] = amount_exer_passed / cant_sol_acum
    if amount_submissions > 0:
        submissions.loc[student_rows, 'PCA'] = amount_exercises_attempted / amount_submissions

# PCA Mejorado

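Ejercicios no pasados (Enp): $Enp = Ei - Ec$, donde $Ei$ es la cantidad de ejercicios intentados y $Ec$ la cantidad de ejercicios completados con éxito.

Soluciones no exitosas (Sne): Sean $\left\{ e_{1}, .., e_{m} \right\}$ los ejercicios intentados por un mismo usuario cuya finalización no fue exitosa y sean $\left\{ s_{1}, .., s_{m} \right\}$ las cantidades de soluciones enviadas por ese usuario para cada uno de ellos.

Promedio de error (Pde): $\frac{Enp}{\sum_{i=1}^{m} s_{i}}$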
Luego de haber realizado estas definiciones podemos definir al promedio con aplazos mejorado de la siguiente forma
$$PCAM = PSA - (1 - Pde)$$



In [None]:
# PCAM per student, copied onto every row of that student.
# NOTE(review): the markdown above defines PCAM = PSA - (1 - Pde), while this
# cell computes (1 - Pde) - PSA (clipped at 0). The computation is kept as it
# was; confirm which sign is intended.
submissions['PCAM'] = 0.0  # float column: it receives ratios
students = submissions['student.email'].unique()
for student in tqdm(students, desc='students'):
    student_rows = submissions['student.email'] == student
    student_df = submissions[student_rows]
    psa_student = student_df['PSA'].values[0]

    exercises_attempted = set(student_df['exercise.name'].unique())
    exercises_passed = set(
        student_df[student_df['submissions_status'] == 'passed']['exercise.name'].unique())
    exercises_not_passed = list(exercises_attempted.difference(exercises_passed))

    # submissions sent to exercises that were never passed
    amount_submissions_not_passed = 0
    for exercise in exercises_not_passed:
        amount_submissions_not_passed += int((student_df['exercise.name'] == exercise).sum())

    # When every attempted exercise was passed there is nothing to divide by;
    # the error average is then defined as 1 (explicit check instead of a bare
    # `except`, which would also hide unrelated errors).
    if amount_submissions_not_passed > 0:
        promedio_de_error = len(exercises_not_passed) / amount_submissions_not_passed
    else:
        promedio_de_error = 1
    pcam = max(0, (1 - promedio_de_error) - psa_student)
    submissions.loc[student_rows, 'PCAM'] = pcam


# Dropout Dimension
## Dropout proportion / proporcion de abandonos

In [None]:
# PA: share of the student's submissions flagged in `dropout_tray`.
# EA: share of the student's attempted exercises with at least one `dropout` row.
# Float-initialised because both columns receive ratios.
submissions['PA'] = 0.0
submissions['EA'] = 0.0
students = submissions['student.email'].unique()
for student in tqdm(students, desc='students'):
    # Slice the student's rows once and reuse the slice for every count.
    student_rows = submissions['student.email'] == student
    student_df = submissions[student_rows]

    # submissions flagged as dropout trajectory
    dropout_amount = student_df[student_df['dropout_tray']].shape[0]
    # submissions sent by the student
    amount_submissions = student_df.shape[0]

    # exercises with at least one dropout
    exercises_dropped = student_df[student_df['dropout']]['exercise.name'].nunique()
    # exercises attempted
    exercises_attempted = student_df['exercise.name'].nunique()

    # proportion of dropped exercises
    submissions.loc[student_rows, 'EA'] = exercises_dropped / exercises_attempted
    # proportion of submissions flagged as dropout
    submissions.loc[student_rows, 'PA'] = dropout_amount / amount_submissions
    

### Proporcion Ponderada de Abandono (PPA)
Sean $\left\{ e_{1}, .., e_{n} \right\}$ los ejercicios abandonados por un estudiante x, es decir aquellos ejercicios con al menos una solución marcada como abandono. Sean $\left\{ a_{1}, .., a_{n} \right\}$ las cantidades de soluciones consideradas como abandono en cada uno de esos ejercicios y sean $\left\{ s_{1}, .., s_{n} \right\}$ las cantidades totales de soluciones enviadas a cada uno de ellos. Definimos esta métrica como:
$$ \frac{1}{n} \sum_{i=1}^{n} \frac{a_{i}}{s_{i}} $$


In [None]:
# PPA per student: for every exercise with at least one `dropout` row, the share
# of its submissions flagged in `dropout_tray`, averaged over those exercises
# (0 when the student dropped no exercise).
submissions['PPA'] = 0.0  # float column: it receives ratios
students = submissions['student.email'].unique()
for student in tqdm(students):
    # Slice the student's rows once and reuse the slice for every exercise.
    student_rows = submissions['student.email'] == student
    student_df = submissions[student_rows]

    exercises_dropped_out = student_df[student_df['dropout']]['exercise.name'].unique()
    amount_exercises_dropped_out = len(exercises_dropped_out)
    acumulator = 0
    for exercise in exercises_dropped_out:
        exer_df = student_df[student_df['exercise.name'] == exercise]
        submissions_amount_exer = exer_df.shape[0]
        submission_dropped = exer_df[exer_df['dropout_tray']].shape[0]
        acumulator += submission_dropped / submissions_amount_exer

    if amount_exercises_dropped_out == 0:
        wn = 0
    else:
        wn = acumulator / amount_exercises_dropped_out
    submissions.loc[student_rows, 'PPA'] = wn

# Nivel Intensidad
- Promedio de tiempo entre soluciones (PTT)
- Promedio de distancia de edición entre soluciones (PDL)

In [None]:
# PTT: mean `time_dist` of the student's submissions sent within one session.
# PDL: mean `distance` of the student's submissions that stay on the same exercise.
# A student with no qualifying row gets NaN (mean of an empty selection).
# Same 454-second limit that the dropout annotation uses to separate sessions.
max_session_gap = 454
# Float-initialised because both columns receive means.
submissions['PTT'] = 0.0
submissions['PDL'] = 0.0
students = submissions['student.email'].unique()
for student in tqdm(students):
    student_rows = submissions['student.email'] == student
    student_df = submissions[student_rows]

    within_session = (student_df['time_dist'] <= max_session_gap) & (student_df['time_dist'] > 0)
    ptt = student_df[within_session]['time_dist'].mean()
    pdl = student_df[student_df['switch_exercise'] == False]['distance'].mean()

    submissions.loc[student_rows, 'PTT'] = ptt
    submissions.loc[student_rows, 'PDL'] = pdl

# Insistencia ponderada por abandono (IPA)
Con el objetivo de capturar cuán insistente es el estudiante x cada vez que comienza a resolver un ejercicio construimos la siguiente característica. Sean $\left\{ e_{1}, .., e_{n} \right\}$ los ejercicios que intentó resolver un estudiante x. Sean $\left\{ t_{1}, .., t_{n} \right\}$ la cantidad de veces que cada ejercicio terminó en un estado de finalización (exitoso o abandono) y sean $\left\{ s_{1}, .., s_{n} \right\}$ la cantidad de soluciones enviadas a cada ejercicio para llegar a esos estados de finalización. Definimos nuestra métrica como:
$$ \frac{1}{n} \sum_{i=1}^{n} \frac{t_{i}}{s_{i}} $$

In [None]:
# IPA per student: for every attempted exercise, (number of `dropout` rows, plus
# one if the exercise has any 'passed' submission) divided by the number of
# submissions sent to it, averaged over the attempted exercises.
submissions['IPA'] = 0.0  # float column: it receives ratios
students = submissions['student.email'].unique()
for student in tqdm(students):
    # Slice the student's rows once and reuse the slice for every exercise.
    student_rows = submissions['student.email'] == student
    student_df = submissions[student_rows]

    exercises_attempted = student_df['exercise.name'].unique()
    acumulator = 0
    for exercise in exercises_attempted:
        exer_df = student_df[student_df['exercise.name'] == exercise]
        amount_dropouts = exer_df[exer_df['dropout']].shape[0]
        passed_amount = int((exer_df['submissions_status'] == 'passed').sum())
        # A passed exercise counts as one extra finished attempt.
        total_attempt = amount_dropouts + (1 if passed_amount > 0 else 0)
        submissions_by_exer = exer_df.shape[0]
        acumulator += total_attempt / submissions_by_exer
    wn = acumulator / len(exercises_attempted)
    submissions.loc[student_rows, 'IPA'] = wn
    

# Dimension Ejercicio
## Dificultad

### Promedio de cantidad soluciones para aprobar(PCSA)

In [None]:
# PCSA per exercise: mean `submissions_count` of its 'passed' submissions, copied
# onto every row of the exercise (NaN when the exercise has no passed submission).
submissions['PCSA'] = 0.0  # float column: it receives means
is_passed = submissions['submissions_status'] == 'passed'
exercises = submissions['exercise.name'].unique()
for exercise in exercises:
    exercise_rows = submissions['exercise.name'] == exercise
    submissions.loc[exercise_rows, 'PCSA'] = submissions.loc[
        exercise_rows & is_passed, 'submissions_count'].mean()

### Cantidad de abandonos por ejercicio (APE)

In [None]:
# APE per exercise: share of its submissions flagged in `dropout_tray`, copied
# onto every row of the exercise.
submissions['APE'] = 0.0  # float column: it receives ratios
exercises = submissions['exercise.name'].unique()
for exercise in exercises:
    exercise_rows = submissions['exercise.name'] == exercise
    exer_df = submissions[exercise_rows]
    submissions_amount = exer_df.shape[0]
    dropout_amount = exer_df[exer_df['dropout_tray']].shape[0]
    submissions.loc[exercise_rows, 'APE'] = dropout_amount/submissions_amount

### Cantidad de Abandonos por estudiante (CAPE)
Cantidad de Abandonos por estudiante: número de estudiantes que abandonaron el ejercicio / cantidad de soluciones marcadas como abandono.

In [None]:
# CAPE per exercise: number of students / number of submissions flagged in
# `dropout_tray`, copied onto every row of the exercise (0 when the exercise has
# no flagged submission).
# NOTE(review): the markdown definition uses the number of students who
# *abandoned* the exercise; the numerator here counts every student who
# attempted it. The numerator is kept as it was — confirm which one is intended.
submissions['CAPE'] = 0.0  # float column: it receives ratios
exercises = submissions['exercise.name'].unique()
for exercise in exercises:
    exercise_rows = submissions['exercise.name'] == exercise
    exer_df = submissions[exercise_rows]
    students_attempted = exer_df['student.email'].unique()
    # Count only the rows where dropout_tray is True. The previous expression,
    # exer_df['dropout_tray'].shape[0], counted every row of the exercise.
    dropout_amount = exer_df[exer_df['dropout_tray']].shape[0]
    if dropout_amount > 0:
        cape = len(students_attempted) / dropout_amount
    else:
        cape = 0
    submissions.loc[exercise_rows, 'CAPE'] = cape

### Completitud (COMP)

In [None]:
# COMP per exercise: number of 'passed' submissions / number of distinct students
# who attempted the exercise, copied onto every row of the exercise.
# NOTE(review): the numerator counts passed submissions, not students who passed,
# so the value can exceed 1 when a student has several passed submissions —
# confirm this is the intended definition of completeness.
submissions['COMP'] = 0.0  # float column: it receives ratios
exercises = submissions['exercise.name'].unique()
for exercise in exercises:
    exercise_rows = submissions['exercise.name'] == exercise
    exer_df = submissions[exercise_rows]
    students_attempted = exer_df['student.email'].nunique()
    passed_submissions = exer_df[exer_df['submissions_status'] == 'passed'].shape[0]
    submissions.loc[exercise_rows, 'COMP'] = passed_submissions / students_attempted

## Train set

In [None]:
# Number of characters of each submission's source code.
submissions['length'] = submissions['content'].map(len)

In [None]:
# Submission lengths, longest first.
submissions.sort_values('length', ascending=False)['length']
#print(submissions[submissions['length'] == 586]['content'][13957])

In [None]:
# Load the dataset used for training.
# NOTE(review): this rebinds `submissions`, so every feature column computed in
# the cells above is discarded; the loaded file is expected to contain them.
# Pickle files can execute code when loaded — only open trusted ones.
submissions = pd.read_pickle('dataframes_pkl/mumuki_io_FINAL_anotado.pkl')
#submissions = pd.read_pickle('dataframes_pkl/introAlgo_FINAL_anotado.pkl')
# PTT / PDL are NaN for students without qualifying rows; use 0 for them.
# Assign the result back: calling fillna(inplace=True) on a column selection
# acts on an intermediate object and is not guaranteed to update the frame.
submissions['PTT'] = submissions['PTT'].fillna(value=0)
submissions['PDL'] = submissions['PDL'].fillna(value=0)



In [None]:
# Columns available in the loaded dataset.
submissions.columns

In [None]:
# Per-exercise split of the whole dataset; the second frame returned by
# train_dev_test is used as the test set from here on.
# NOTE(review): no seed is passed, so the split changes on every run.
train_df, test_df= train_dev_test(submissions)

In [None]:
# Candidate feature columns and the label column used by the models below.
features = ['content', 'submissions_count','time_dist', 'distance', 'PA', 'EA','PCA','PSA','PCAM', 'PPA', 'PTT','PDL', 'IPA', 'PCSA', 'APE', 'CAPE', 'COMP','exercise.eid']
target = 'dropout_tray'
X_train = train_df[features]
Y_train = train_df[target]

In [None]:
# Same feature / label selection for the held-out frame.
X_test = test_df[features]
Y_test = test_df[target] 

## dev Set

# Probando con DataFrameMapper

In [None]:
# Column selection for the classifier: only 'submissions_count', 'PA' and 'APE'
# are used (presumably `None` passes the columns through untransformed — see the
# sklearn-pandas documentation).
# NOTE(review): in this notebook 'PA' and 'APE' are computed from `dropout_tray`
# over the whole dataset, and `dropout_tray` is also the target, so the
# evaluation below may be affected by target leakage — confirm.
mapper = DataFrameMapper([
    #('content', CountVectorizer()),
    (['submissions_count'], None),
    (['PA','APE'], None),
 ])
# NOTE(review): the result of this call is discarded; the pipeline fits the
# mapper again below.
mapper.fit_transform(X_train)
pipe2 = Pipeline([
    ('mapper', mapper),
    ('classifier', LogisticRegression(C=1,penalty='l2', solver='liblinear',tol=1e-6, class_weight='balanced',
                         max_iter=int(1e6), warm_start=True, verbose=3))
])

from timeit import default_timer as timer

# Fit the pipeline and print the elapsed wall-clock time in seconds.
start = timer()
pipe2.fit(X_train, Y_train)
end = timer()
print(end - start)


In [None]:
from timeit import default_timer as timer

# Predict on the held-out frame and print the elapsed time in seconds.
start = timer()
preds = pipe2.predict(X_test)
end = timer()
print(end - start) # Time in seconds, e.g. 5.38091952400282

report = classification_report(Y_test, preds, digits=4)
cm = confusion_matrix(Y_test, preds)  # kept for inspection; not printed here
print(report)
# scikit-learn metric functions take (y_true, y_pred). The arguments were
# previously swapped, which exchanges precision and recall and weights the
# average by the predicted class counts instead of the true ones.
p, r, f1, s = precision_recall_fscore_support(Y_test, preds, average='weighted')
print(p, r, f1, s)

In [None]:
# (rows, columns) of the held-out feature frame.
X_test.shape

In [None]:
# Features and labels of the whole dataset, used for cross-validation.
X_all = submissions[features]
Y_all = submissions[target]

In [None]:
# NOTE(review): `metrics` is imported here but not used in this cell.
from sklearn import metrics
# 10-fold cross-validation of the logistic-regression pipeline on the whole
# dataset, scored with the weighted F1; the mean over the folds is printed.
scores = cross_val_score(pipe2, X_all, Y_all, cv=10, n_jobs=-1,  scoring='f1_weighted')
#scores_test = cross_val_score(pipe_dummy, X_all, Y_all, cv=10, scoring='f1_weighted')
print("10-fold-cross-validation {}".format(np.array(scores).mean()))
#print("10-fodl-cross-validation {}".format(np.array(scores_test).mean()))

In [None]:
from sklearn.dummy import DummyClassifier

In [None]:
# Baseline: same column selection as `pipe2`, followed by a DummyClassifier.
# NOTE(review): DummyClassifier is created with its defaults; the default
# strategy depends on the installed scikit-learn version, so consider passing
# `strategy=` (and `random_state=`) explicitly — confirm against the docs.
pipe_dummy = Pipeline([
    ('mapper', mapper),
    ('classifier', DummyClassifier())
])


In [None]:
# Fit the baseline on the training frame.
pipe_dummy.fit(X_train, Y_train)

In [None]:
# Evaluate the baseline on the held-out frame.
preds = pipe_dummy.predict(X_test)
# scikit-learn metric functions take (y_true, y_pred). The arguments were
# previously swapped, which exchanges precision and recall and weights the
# average by the predicted class counts instead of the true ones.
p, r, f1, s = precision_recall_fscore_support(Y_test, preds, average='weighted')
print(p, r, f1, s)
print(classification_report(Y_test, preds, digits=4))

In [None]:
# Share of all submissions flagged in `dropout_tray` (the positive-class rate).
submissions[submissions['dropout_tray']].shape[0] / submissions.shape[0]