In [None]:
import json
import pandas as pd
import Levenshtein as lv
import numpy as np
import os
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn_pandas import DataFrameMapper
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [None]:
def measure_time_distance(timedelta):
    """
    Express a time difference as a number of seconds.

    timedelta: object exposing ``total_seconds()`` (for example a
    ``datetime.timedelta`` or a ``pandas.Timedelta``)

    return whatever ``timedelta.total_seconds()`` yields
    """
    return timedelta.total_seconds()

def measure_text_distance(x,y):
    """
    Levenshtein (edit) distance between the code of two submissions.
    https://en.wikipedia.org/wiki/Levenshtein_distance

    x, y: the two texts to compare

    The computation is delegated to ``distance`` from the ``Levenshtein``
    package (imported in this notebook as ``lv``).
    """
    edit_distance = lv.distance(x, y)
    return edit_distance

def calculate_distribution(df, column_name, exercise, mean=True):
    """
    Summarise how the submissions in ``df`` are spread over the values of
    ``column_name``.

    df: Dataframe with submissions
    column_name: column with status
    exercise: label stored (as a string) under the 'exercise' key
    mean: when True each status count is divided by the number of rows;
          when False the raw counts are kept

    return a Series indexed by status value, with two extra entries:
    'exercise' and 'submission_amount' (the number of rows in ``df``)
    """
    n_rows = len(df)
    per_status = df.groupby([column_name]).size()
    metrics = per_status / n_rows if mean else per_status
    # Assigning to labels that do not exist yet appends them to the Series.
    metrics['exercise'] = str(exercise)
    metrics['submission_amount'] = n_rows
    return metrics

def shift_columns(df, shift_social=False):
    """
    Pair every row of ``df`` with the row directly above it.

    The frame is modified in place and also returned. Columns added:

    shifted, guide.name.previous, datetime_shifted, student.email.previous,
    exercise.eid.previous, exercise.name.previous,
    submissions_status.previous, submissions_count.previous:
        the value of the corresponding column one row earlier
    distance: Levenshtein distance between submissions_content and shifted
    time_dist: seconds elapsed between datetime_shifted and datetime
    switch_exercise: True when exercise.name differs from exercise.name.previous
    submitter.social_id.previous: only added when ``shift_social`` is True

    Missing values left by the shift (the first row has no predecessor) are
    replaced by '' or, for the datetime, by the timestamp 1800-01-01.
    The shift follows the current row order and ignores student boundaries.
    """
    def previous(column, fill=''):
        # Column moved one row down, with missing values replaced by `fill`.
        return df[column].shift().fillna(value=fill)

    df.loc[:, 'shifted'] = previous('submissions_content')
    df.loc[:, 'guide.name.previous'] = previous('guide.name')
    df.loc[:, 'datetime_shifted'] = previous('datetime', pd.Timestamp(1800, 1, 1, 0))
    df.loc[:, 'student.email.previous'] = previous('student.email')

    df['distance'] = df.apply(
        lambda row: measure_text_distance(row['submissions_content'], row['shifted']),
        axis=1)
    df['time_dist'] = df.apply(
        lambda row: measure_time_distance(row['datetime'] - row['datetime_shifted']),
        axis=1)

    for column in ('exercise.eid', 'exercise.name', 'submissions_status'):
        df.loc[:, column + '.previous'] = previous(column)
    df.loc[:, 'submissions_count.previous'] = previous('submissions_submissions_count')

    df['switch_exercise'] = df.apply(
        lambda row: row['exercise.name'] != row['exercise.name.previous'],
        axis=1)

    if shift_social:
        df['submitter.social_id.previous'] = previous('submitter.social_id')

    return df


# Load dataset and replace null value in email by submitter.social_id

In [None]:
# Directory (relative to the working directory) that holds the pickled dataframes.
base = 'dataframes_pkl/'
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle(base + 'failed_submissions_df_FINAL.pkl')
# Rows without an e-mail receive submitter.social_id as the student identifier.
df.loc[df['student.email'].isnull() , 'student.email'] = df['submitter.social_id']

In [None]:
# `submissions` is a second name for the same DataFrame object as `df` (no copy is made).
submissions = df

# calculate distribution

In [None]:
# Raw number of submissions per status over the whole dataset (mean=False),
# tagged with the exercise label "todos" (Spanish for "all").
calculate_distribution(submissions, "submissions_status", "todos", False)

# Number of unique students

In [None]:
# Number of distinct values in student.email (missing e-mails were replaced by
# submitter.social_id when the dataset was loaded).
submissions['student.email'].nunique()

# Exercises attempted

In [None]:
# Total number of (student, exercise) pairs: count the distinct exercises of
# every student and add the counts up. A single groupby replaces the previous
# loop, which filtered the whole frame once per student.
# sort=False: the ordering of the groups is irrelevant for a sum.
exercises_attempted = int(
    submissions.groupby('student.email', sort=False)['exercise.name'].nunique().sum()
)
exercises_attempted

# Shift columns

In [None]:
# Order the rows by student and then by time, so that the row above each
# submission is normally the same student's previous submission. At a student
# boundary the row above belongs to another student; later cells exclude those
# rows by comparing student.email with student.email.previous.
submissions = submissions.sort_values(['student.email', 'datetime'])
# shift_social=True additionally creates submitter.social_id.previous.
submissions = shift_columns(submissions, True)

In [None]:
# Inspect the column names now that the shifted columns have been added.
submissions.columns

# Calculate Switching exercise Dropout

In [None]:
# Rows where a student moved on to a different exercise immediately after a
# failed or errored submission (same student on both rows).
# NOTE(review): .iloc[1:] discards the first matching row — confirm that is intended.
submissions.loc[
    submissions['switch_exercise']
    & submissions['submissions_status.previous'].isin(['failed', 'errored'])
    & (submissions['student.email'] == submissions['student.email.previous']),
    [
        'exercise.name.previous', 'submissions_status.previous', 'time_dist',
        'switch_exercise', 'submissions_status', 'student.email.previous',
        'student.email', 'exercise.name', 'datetime', 'datetime_shifted',
    ],
].iloc[1:]

# Calculate In session dropout

In [None]:
# Gap, in seconds, above which a new attempt at the same exercise is treated
# as the start of a new session.
threshold = 565
# Rows where the same student came back to the same exercise more than
# `threshold` seconds after a failed or errored submission.
# NOTE(review): .iloc[1:] discards the first matching row — confirm that is intended.
submissions.loc[
    submissions['switch_exercise'].eq(False)
    & submissions['submissions_status.previous'].isin(['failed', 'errored'])
    & submissions['time_dist'].gt(threshold)
    & (submissions['student.email'] == submissions['student.email.previous']),
    [
        'exercise.name.previous', 'submissions_status.previous', 'time_dist',
        'switch_exercise', 'submissions_status', 'student.email.previous',
        'student.email', 'exercise.name', 'datetime',
    ],
].iloc[1:]

## Anoto el dataset
- En la columna `dropout_switch` se anotan con True los abandonos por cambio de ejercicio.
- En la columna `dropout_session` se anotan con True los abandonos por sesión.

In [None]:
# dropout_switch: the student changed exercise right after a failed or errored
# submission. Both rows must belong to the same student, because the shifted
# columns ignore student boundaries.
# Built as one vectorised boolean mask (the same condition as the filter shown
# above) instead of a row-by-row apply.
submissions['dropout_switch'] = (
    submissions['switch_exercise'].astype(bool)
    & submissions['submissions_status.previous'].isin(['failed', 'errored'])
    & (submissions['student.email'] == submissions['student.email.previous'])
)

In [None]:
# dropout_session: the student stayed on the same exercise but let more than
# `threshold` seconds pass after a failed or errored submission. Both rows must
# belong to the same student, because the shifted columns ignore student
# boundaries.
# `threshold` is the constant defined with the in-session filter above, reused
# here so that the displayed rows and the annotation cannot drift apart.
submissions['dropout_session'] = (
    ~submissions['switch_exercise'].astype(bool)
    & submissions['submissions_status.previous'].isin(['failed', 'errored'])
    & (submissions['time_dist'] > threshold)
    & (submissions['student.email'] == submissions['student.email.previous'])
)

# Anoto el dataset de la segunda Forma

In [None]:
# From here on the annotated frame is used under the name `df`
# (same object as `submissions`, not a copy).
df = submissions

In [None]:
# Second annotation scheme ("trajectory"): besides the submission that ends in
# a dropout, also flag the submissions that led up to it within the same
# (student, exercise) trajectory.
df['dropout_session_tray'] = False
df['dropout_switch_tray'] = False
# Newest rows first, so every trajectory is walked backwards in time.
df = df.sort_values(['datetime_shifted', 'student.email.previous'], ascending=False)

session_rows = []  # index labels to flag in dropout_session_tray
switch_rows = []   # index labels to flag in dropout_switch_tray

# One group per (student, exercise) pair. groupby keeps the row order of `df`
# inside each group; sort=False avoids ordering the group keys, which is not
# needed here.
trajectories = df.groupby(['student.email.previous', 'exercise.name.previous'], sort=False)
for (student, exercise_name), exer_df in trajectories:
    # A trajectory without any dropout row can never raise a flag.
    if not (exer_df['dropout_switch'] | exer_df['dropout_session']).any():
        continue
    flag_session = False
    flag_switch = False
    for index, row in exer_df.iterrows():
        # A session dropout starts (or continues) a flagged run; the run stops
        # at the first row that is a switch dropout.
        flag_session = bool((row['dropout_session'] or flag_session)
                            and not row['dropout_switch'])
        if flag_session:
            session_rows.append(index)

        # Mirror image for switch dropouts: the run stops at a session dropout.
        flag_switch = bool((row['dropout_switch'] or flag_switch)
                           and not row['dropout_session'])
        if flag_switch:
            switch_rows.append(index)

# Write the flags in one step, after the iteration over `df` has finished.
if session_rows:
    df.loc[session_rows, 'dropout_session_tray'] = True
if switch_rows:
    df.loc[switch_rows, 'dropout_switch_tray'] = True

In [None]:
# Totals for both annotation schemes. Summing a boolean column counts its
# True values.
cant_soluciones = len(df)
cant_switch = int(df['dropout_switch'].sum())
cant_session = int(df['dropout_session'].sum())

cant_switch_tray = int(df['dropout_switch_tray'].sum())
cant_session_tray = int(df['dropout_session_tray'].sum())

print("Cantidad submissions {}".format(cant_soluciones))

# First scheme: only the last submission before the dropout is flagged.
print("1era forma de anotar (solo la ultima submissions)")
print("Cantidad de dropout session {}. Proporcion respecto total submissions {:.3f}".format(
    cant_session, cant_session / cant_soluciones))
print("Cantidad de dropout switchs {}. Proporcion respecto total submissions {:.3f} ".format(
    cant_switch, cant_switch / cant_soluciones))
print("Considerando ambos tipos como el mismo cantidad {}  proporcion {:.3f}".format(
    cant_session + cant_switch, (cant_session + cant_switch) / cant_soluciones))

# Second scheme: the whole trajectory is flagged.
print("\n2da forma de anotar (anoto trayectoria)")
print('Cantidad de dropout session {}. Proporcion respecto total submissions {:.3f}'.format(
    cant_session_tray, cant_session_tray / cant_soluciones))
print('Cantidad de dropout switch {}. Proporcion respecto total submissions {:.3f}'.format(
    cant_switch_tray, cant_switch_tray / cant_soluciones))
print("Considerando ambos tipos como el mismo cantidad {}  proporcion {:.3f}".format(
    cant_session_tray + cant_switch_tray,
    (cant_session_tray + cant_switch_tray) / cant_soluciones))

In [None]:
# Keep the "previous row" columns together with the annotations and give them
# the plain column names: each row of submissions_df then describes the earlier
# submission of a pair, labelled with the dropout flags computed for that pair.
submissions_df = df.loc[:, [
    'shifted', 'guide.name.previous', 'datetime_shifted', 'student.email.previous',
    'distance', 'time_dist', 'exercise.eid.previous', 'exercise.name.previous',
    'submissions_status.previous', 'submissions_count.previous', 'switch_exercise',
    'dropout_switch', 'dropout_session', 'dropout_session_tray', 'dropout_switch_tray',
]].rename(columns={
    'shifted': 'content',
    'guide.name.previous': 'guide.name',
    'datetime_shifted': 'datetime',
    'student.email.previous': 'student.email',
    'exercise.eid.previous': 'exercise.eid',
    'exercise.name.previous': 'exercise.name',
    'submissions_status.previous': 'submissions_status',
    'submissions_count.previous': 'submissions_count',
})

In [None]:
# Persist the annotated frame in the working directory.
# NOTE(review): this runs before the rows with an empty exercise name are
# removed and before the merged dropout columns are added, so the pickle
# contains neither change — confirm that is intended.
submissions_df.to_pickle('mumuki_io_final_anotado.pkl')

In [None]:
# Discard rows whose exercise name is '' (the filler shift_columns writes where
# there is no previous value). .copy() makes the result an independent frame,
# so the column assignments that follow do not raise SettingWithCopyWarning.
submissions_df = submissions_df[submissions_df['exercise.name'] != ''].copy()

## merge dropout columns 

In [None]:
# A submission is a dropout when either kind of dropout applies. A column-wise
# OR replaces the row-by-row apply and the throw-away `= False` initialisation.
submissions_df['dropout'] = (
    submissions_df['dropout_switch'] | submissions_df['dropout_session']
)

# Same merge for the trajectory annotation.
submissions_df['dropout_tray'] = (
    submissions_df['dropout_switch_tray'] | submissions_df['dropout_session_tray']
)

# Baseline: Logistic Regression

### Train dev test split

In [None]:
# Directory where the train/dev/test pickles are stored.
base = 'datasets/mumuki-io/'
def train_dev_test(df, proportion=(.8, .9), random_state=None):
    """
    Split ``df`` into train, dev and test sets, exercise by exercise.

    The rows of every distinct ``exercise.name`` are shuffled and cut
    separately, so each exercise contributes to the three sets in the same
    proportions (up to rounding down).

    df: DataFrame with an 'exercise.name' column
    proportion: two cumulative fractions ``(train_end, dev_end)``; per
        exercise the first ``int(train_end * n)`` shuffled rows go to train,
        the rows up to ``int(dev_end * n)`` go to dev and the rest to test.
        The default (.8, .9) gives an 80/10/10 split.
    random_state: optional seed for the shuffle; ``None`` draws a fresh
        random shuffle on every call.

    return (train_df, dev_df, test_df), each with a fresh 0..n-1 index;
    empty DataFrames when ``df`` has no rows.
    raise ValueError when the fractions are not 0 <= train_end <= dev_end <= 1
    (otherwise the three sets could overlap).

    The shapes of the three pieces are printed for every exercise.
    """
    train_end, dev_end = proportion
    if not 0 <= train_end <= dev_end <= 1:
        raise ValueError(
            "proportion must satisfy 0 <= train_end <= dev_end <= 1, got {!r}".format(proportion))

    # One generator shared by all exercises, so a seed fixes the whole split.
    rng = np.random.RandomState(random_state)

    train_parts, dev_parts, test_parts = [], [], []
    for exercise in df['exercise.name'].unique():
        df_exer = df[df['exercise.name'] == exercise].sample(frac=1, random_state=rng)
        first_cut = int(train_end * len(df_exer))
        second_cut = int(dev_end * len(df_exer))
        train = df_exer.iloc[:first_cut]
        dev = df_exer.iloc[first_cut:second_cut]
        test = df_exer.iloc[second_cut:]
        print(train.shape, dev.shape, test.shape, exercise)
        train_parts.append(train)
        dev_parts.append(dev)
        test_parts.append(test)

    def _stack(parts):
        # Concatenate once at the end; pd.concat rejects an empty list.
        return pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()

    return _stack(train_parts), _stack(dev_parts), _stack(test_parts)

In [None]:
# Split the annotated submissions with the default proportions of train_dev_test.
train, dev, test = train_dev_test(submissions_df)

In [None]:
# Store the three splits under `base`, so the modelling cells below can reload
# them from disk.
train.to_pickle(base+'train-io-801010.pkl')
dev.to_pickle(base+'dev-io-801010.pkl')
test.to_pickle(base+'test-io-801010.pkl')

# load dataset from pickle files

In [None]:
# `base` is set again here, so this cell does not rely on the split cell's value.
base = 'datasets/mumuki-io/'
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
train_df = pd.read_pickle(base+'train-io-801010.pkl')
dev_df = pd.read_pickle(base+'dev-io-801010.pkl')
test_df = pd.read_pickle(base+'test-io-801010.pkl')

In [None]:
# Input columns and label used by the baseline model.
# NOTE(review): both columns were built with fillna(''), so they may hold ''
# next to numbers — verify they are numeric before fitting.
features = ['submissions_count', 'exercise.eid']
# Label: session dropout under the trajectory annotation.
target = 'dropout_session_tray'
X_train = train_df[features]
Y_train = train_df[target]

In [None]:
# Column-to-transformer mapping for the pipeline. The transformer given for
# both columns is None — presumably a pass-through in sklearn_pandas; verify
# against its documentation.
mapper = DataFrameMapper([
     (['submissions_count'], None),
     (['exercise.eid'], None)
 ])


In [None]:
# Preview what the mapper produces for the training features.
mapper.fit_transform(X_train)

In [None]:
# Baseline model: the mapper output goes into an L2-penalised logistic
# regression fitted with the saga solver.
pipe2 = Pipeline([
    ('mapper', mapper),
    ('classifier', LogisticRegression(C=.5,penalty='l2', solver='saga',tol=1e-6,
                         max_iter=int(1e6), warm_start=True, n_jobs=-1))
])

In [None]:
# Fit the mapper and the classifier on the training split.
pipe2.fit(X_train, Y_train)

## Test model in dev dataset

In [None]:
# Same feature columns and label as for training, taken from the dev split.
X_dev = dev_df[features]
Y_dev = dev_df[target]

In [None]:
# classification_report expects (y_true, y_pred). With the arguments the other
# way round, precision and recall are reported swapped.
print(classification_report(Y_dev, pipe2.predict(X_dev)))

In [None]:
# Share of submissions flagged as dropout under the trajectory annotation.
submissions_df[submissions_df['dropout_tray']].shape[0]/submissions_df.shape[0]

In [None]:
# Submissions flagged as dropout (switch or session) under the first annotation.
submissions_df[submissions_df['dropout']]