<a name="Inicio"></a>
# Data Analysis

#### Autor: *Ángel Pérez Lemonche*


## Descripción general

Este script utiliza las matrices de transición generadas en "Learning Paths with General *bricks*" para analizar el contenido de los datos y apoyar la toma de decisiones futuras. El mismo proceso se puede realizar utilizando **vectores de frecuencias**, normalizando por el número total de transiciones de cada semana.

### Importación de librerías y declaración de funciones

In [1]:
# Librerías generales
import numpy as np
import pandas as pd
import time

import sys
sys.path.insert(0, './lib/')

import imp
import myfunclib as mf
import auxiliares as ax
#import transiciones as tr
import myValFunctions as vf
import myClusteringFunctions as cf


# Librerías propias
mf = imp.reload(mf)
ax = imp.reload(ax)
#tr = imp.reload(tr)

vf = imp.reload(vf)
cf = imp.reload(cf)


pd.set_option('display.max_columns', 30)
from IPython.core.display import display, HTML, clear_output

# Librerías ML
from sklearn.cluster import KMeans

# Gráficos
import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib import style
#style.use("ggplot")
plt.rcParams["figure.figsize"] = [5,5]

ModuleNotFoundError: No module named 'auxiliares'

Definimos las semanas del curso y cargamos el fichero de entrada

In [2]:
# Week dictionary: maps each week label to its pair of boundary dates.

from datetime import datetime

# --- EDITION 1 ---
# Eight consecutive week boundaries in 2015, listed as (month, day) pairs
# spaced seven days apart.
sem1, sem2, sem3, sem4, sem5, sem6, sem7, sem8 = [
    datetime(2015, month, day)
    for month, day in [(2, 24), (3, 3), (3, 10), (3, 17),
                       (3, 24), (3, 31), (4, 7), (4, 14)]
]

"""
###EDICION 2 ##

sem1 = datetime(2015,10,5)
sem2 = datetime(2015,10,12)
sem3 = datetime(2015,10,19)
sem4 = datetime(2015,10,26)
sem5 = datetime(2015,11,2)
sem6 = datetime(2015,11,9)
sem7 = datetime(2015,11,16)
sem8 = datetime(2015,11,23)
"""

# Week i runs from boundary i to boundary i+1; only the first six weeks are
# mapped (sem8 is defined above but not used here).
SEM = {
    label: [start, end]
    for label, start, end in zip(
        ("Week1", "Week2", "Week3", "Week4", "Week5", "Week6"),
        (sem1, sem2, sem3, sem4, sem5, sem6),
        (sem2, sem3, sem4, sem5, sem6, sem7),
    )
}

In [None]:
# Load the working file
filename = "files/eventos_final.json"

eventsDFRaw = ax.readDataFile(filename)
# Keep only the rows whose 'Usuario' value is not the empty string. The filtered
# frame is stored under a separate name; a later cell assigns it back to
# eventsDFRaw.
eventsDFRawT = eventsDFRaw[eventsDFRaw['Usuario'] != '']
# NOTE: the size and preview printed here are those of the unfiltered data.
print("Tamaño:",len(eventsDFRaw))
print(eventsDFRaw.head(10))


In [None]:
# From here on, work with the events that have a non-empty 'Usuario'.
eventsDFRaw = eventsDFRawT

In [None]:
# User filtering: drop the users who performed
#  fewer than 50 events in the course
# NOTE(review): the exact rule (strict or inclusive threshold) lives in
# ax.dataFiltering — confirm there.
minEvents = 50

eventsDF = ax.dataFiltering(eventsDFRaw, minEvents);
print("Tamaño:",len(eventsDF))   
print(eventsDF.head(10))

In [None]:
# Transitions per week
# Maps a one-letter event category to the raw event names grouped under it.
dicEvents = {
    'V' : ['play_video', 'seek_video'],
    'N' : ['problem_check'],
    'F' : ['edx.forum.searched', 'edx.forum.comment.created', 
           'edx.forum.response.created', 'edx.forum.thread.created'],
    'P' : ['openassessmentblock.self_assess'],
    'D' : ['textbook.pdf.chapter.navigated']
}

# Options handed to the transition builder.
# NOTE(review): the meaning of each key is defined by tr.Transitions, which is
# not visible here — confirm before changing any value.
param = {'dicEvents': dicEvents,
        'assignationProblemCheckFile': 'IDNaturalProblemasEd1.csv',
        'ProblemCheckExceptions': ['J','X'],
        'useFreq': True}


# Time the whole computation and report it in minutes.
startTotal = time.time()
# NOTE(review): `tr` is never bound in this notebook — the only import of it
# (`import transiciones as tr`) is commented out in the import cell, so this
# line raises NameError as written. Re-enable that import (or load a
# precomputed transitions table) before running this cell.
transitionsDF = tr.Transitions(eventsDF, param)
print("%.2f"%((time.time() - startTotal)/60), 'minutos.')

In [None]:
# Index the transitions table by user (modifies transitionsDF in place) so
# that only feature columns remain as columns.
transitionsDF.set_index('Usuario', inplace = True)

In [None]:
#transitionsDF.to_csv('Transitionsdf.csv')

Reducción de dimensionalidad

In [None]:
# Selección de características

def MinMaxNorm(series):
    """Rescale ``series`` linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    series : pandas.Series or array-like exposing ``min()`` / ``max()``
        Values to rescale.

    Returns
    -------
    Same kind of object as ``series``, holding ``(series - min) / (max - min)``.
    A constant input (``max == min``) has no range to rescale and is mapped
    to all zeros instead of the NaN values that 0/0 would produce.
    """
    minim = series.min()
    span = series.max() - minim
    # Only guard the scalar case; for frame-like inputs ``span`` is itself a
    # vector and the plain element-wise division is kept unchanged.
    if np.ndim(span) == 0 and span == 0:
        span = 1.0
    return (series - minim) / span

def TotalNorm(series):
    """Divide every entry of ``series`` by the sum of all its entries.

    When the sum is non-zero the result adds up to 1. No guard is applied
    for a zero sum: the division is performed as-is.
    """
    total = series.sum()
    scaled = series / total
    return scaled

def ZNorm(series):
    """Standardise ``series``: subtract its mean and divide by its standard deviation.

    Parameters
    ----------
    series : pandas.Series or array-like exposing ``mean()`` / ``std()``
        Values to standardise. The standard deviation is whatever the
        object's own ``std()`` returns.

    Returns
    -------
    Same kind of object as ``series``, holding ``(series - mean) / std``.
    When the standard deviation is exactly zero (constant input) the result
    is all zeros instead of the NaN values that 0/0 would produce.
    """
    mean = series.mean()
    std = series.std()
    # Only guard the scalar case; for frame-like inputs ``std`` is a vector
    # and the plain element-wise division is kept unchanged.
    if np.ndim(std) == 0 and std == 0:
        std = 1.0
    return (series - mean) / std

def featFrequency(featuresDF, info = .95, Normalize = None, Use_Rest = True, verbose = 1):
    """Keep the features that account for most of the total column mass.

    Each column is scored by the absolute value of its column sum divided by
    the total of those absolute sums. Columns are ranked by decreasing score
    and kept while the cumulative score stays strictly below ``info``.

    Parameters
    ----------
    featuresDF : pandas.DataFrame
        One column per feature. It is not modified.
    info : float
        Cumulative-score threshold; a feature is kept only while the running
        total, including that feature, is strictly smaller than this value.
    Normalize : callable or None
        Optional function applied to every column before scoring. It only
        affects the ranking: the returned frame holds the original values.
    Use_Rest : bool
        When true, a ``'Rest'`` column holding the row-wise sum of all
        discarded features is appended to the result.
    verbose : int
        0 prints nothing; any positive value prints a one-line summary;
        2 additionally displays the per-feature percentage table.

    Returns
    -------
    (pandas.DataFrame, list)
        The reduced frame and the names of the selected features, in ranking
        order (``'Rest'`` is not included in the list).

    Notes
    -----
    With ``verbose == 2`` the global pandas option ``display.float_format``
    is left set to four decimals on exit, as before.
    """
    import pandas as pd

    if Normalize is not None:
        # Work on a copy so a normaliser that mutates its input cannot touch
        # the caller's data.
        featDF = featuresDF.copy().apply(Normalize, axis=0)
    else:
        featDF = featuresDF

    suma = featDF.sum(axis=0)
    magnitude = suma.abs()
    perinfo = (magnitude / magnitude.sum()).sort_values(ascending=False)
    cumsum = perinfo.cumsum()

    featInfo = pd.concat([perinfo, cumsum], axis = 1)
    featInfo.columns = ['percentage', 'cumsum']
    featInfo['selection'] = (featInfo['cumsum'] < info)

    selected = featInfo.index[featInfo['selection']]
    discarded = featInfo.index[~featInfo['selection']]

    # Explicit copy: a column is added below, and writing into a bare column
    # subset can trigger pandas' SettingWithCopy warning.
    returnDF = featuresDF[selected].copy()

    if Use_Rest:
        returnDF['Rest'] = featuresDF[discarded].sum(axis=1)

    if verbose > 0:
        print('From',len(featuresDF.columns),'to',len(returnDF.columns), 'selected features.')
        if verbose == 2:
            pd.options.display.float_format = '{:,.2f}%'.format
            try:
                toPrint = featInfo[['percentage','cumsum']] * 100
                toPrint['selection'] = featInfo['selection']
                display(toPrint)
            finally:
                # Always leave the percent format, even if display() fails.
                pd.options.display.float_format = '{:,.4f}'.format

    return returnDF, list(selected)

In [None]:
# Reduce the transition table to its dominant features (cumulative share
# strictly below 96.6 %) and show the per-feature table; the list of selected
# names is discarded.
featuresDF, _ = featFrequency(transitionsDF, info = .966, verbose = 2)

### Validación

In [None]:
# Settings shared by the validation cells below.
nClusters = 10                   # passed as nClusters to the vf validation helpers
seed = 206                       # passed as seed to the vf helpers
nRepetitions = 1000              # passed as nRandomStates / nInit to the vf helpers
computeClusterValidation = True  # switch: set to False to skip the validation cells
plt.rcParams["figure.figsize"] = [5,3]

In [None]:
# Validation: inter-cluster distance criterion.
# NOTE(review): av / st are presumably the average and standard deviation of
# the criterion over the repetitions — verify in myValFunctions.
if computeClusterValidation:
    av, st = vf.interClusterDistKMeans(featuresDF, nClusters = nClusters, nRandomStates = nRepetitions, 
                                    seed = seed, plot = True)

In [None]:
# Validation: BIC criterion. Overwrites av / st from the previous cell.
# NOTE(review): return values presumably average / standard deviation —
# verify in myValFunctions.
if computeClusterValidation:
    av, st = vf.BICCriterionKMeans(featuresDF, nClusters = nClusters, nRandomStates = nRepetitions, 
                                    seed = seed, plot = True)

In [None]:
# Validation: closest-centroid distance criterion. Overwrites av / st.
# NOTE(review): unlike the two helpers above, this one receives the number of
# repetitions as nInit rather than nRandomStates — confirm this matches the
# helper's signature and intended meaning.
if computeClusterValidation:
    av, st = vf.closestCentroidDistKMeans(featuresDF, nClusters = nClusters, nInit = nRepetitions, 
                                    seed = seed, plot = True)

In [None]:
# Validation: silhouette criterion, drawn on a taller 6x9 figure.
# Overwrites av / st; the repetitions are passed as nInit.
if computeClusterValidation:
    plt.figure(figsize=(6,9))
    av, st = vf.silhouetteKMeans(featuresDF, nClusters = nClusters, nInit = nRepetitions, 
                                    seed = seed, plot = True)

In [None]:
# EVALUATE THE PARAMETERS OF THE MODEL

# KMeans is already imported in the first cell; the import is repeated here so
# the cell can be run on its own.
from sklearn.cluster import KMeans
# Estimator with default settings; it is reconfigured through set_params in
# later cells.
model = KMeans()

# Candidate values explored by the (optional) parameter search below.
# NOTE(review): recent scikit-learn releases only accept 'lloyd' / 'elkan' for
# `algorithm`; 'auto' and 'full' are legacy names — confirm against the
# installed version.
param_grid = {
    'n_clusters' : [6],
    'algorithm' : ['auto', 'full'],
    'init' : ['k-means++', 'random'],
    'n_init' : [20],
    'max_iter' : [200, 300],
    'random_state' : [None]
}

# Switch for the parameter search; it is disabled, so the branch below is
# skipped.
ParametersEvaluation = False

if ParametersEvaluation:
    nkFolds = 50
    # The third return value is discarded.
    scores, parameters, _ = vf.DeepCrossValidationClustering(featuresDF, 'KMeans', model, param_grid, kfold=nkFolds)

    print('\nEVALUATION\n')

    vf.evaluateResults(scores, parameters)

In [None]:
# Settings for the seed search and the final fit.
nInitializations = 1000   # passed as nRandomStates to vf.InitializationSeed
nPartitions = 0           # passed as CrossVal to vf.InitializationSeed
test_size = .20           # passed as test_size to the vf helpers

# Configure the estimator in place (set_params returns the estimator itself).
# NOTE(review): algorithm = 'auto' is a legacy value in recent scikit-learn —
# confirm against the installed version.
model.set_params(n_clusters = 6, algorithm = 'auto', init = 'random', max_iter = 300, n_init = 50)

In [None]:
# Re-assert the number of clusters (already set to 6 in the previous cell).
model = model.set_params(n_clusters = 6)

# Search over nInitializations random states.
# NOTE(review): bestSeed is indexed as bestSeed[0] / bestSeed[1] in the next
# cell, so it holds at least two values; their meaning is defined in
# vf.InitializationSeed — verify there.
model, bestSeed, results = vf.InitializationSeed(featuresDF, model, test_size = test_size, seed = seed,
                                nRandomStates = nInitializations, CrossVal = nPartitions)
print('Seed:', bestSeed)

In [None]:
# Fit the model using the two values found by the seed search, then label
# every user and keep the cluster centres for the plots below.
model = vf.CreatePartitionAndFitSelectedModel(featuresDF, model, test_size, bestSeed[0], bestSeed[1])
labels = model.predict(featuresDF)
centers = model.cluster_centers_

In [None]:
# Plot the fitted centroids, using the feature names as dimension labels.
# NOTE(review): the effect of sort / avg / matrix / textThreshold is defined
# in cf.plotCentroids — verify there.
cf.plotCentroids(centers, labels, featuresDF.columns, sort = [], avg = False, matrix = True, textThreshold = -.01)

In [None]:
# Value 0.1, passed as the last argument to vf.checkClusterPositions below.
alpha = 10**-1

In [None]:
# Check the cluster positions against the results of the seed search.
# NOTE(review): the structure of DP / DC is defined in
# vf.checkClusterPositions; the code below only relies on each DP entry
# having at least three fields, the first one numeric.
DP, DC = vf.checkClusterPositions(featuresDF, model, test_size, results, alpha)

# Report only the entries whose first field, read as a percentage, exceeds 2.
# A plain loop is used because the prints are side effects, not list items.
for entry in DP:
    if (entry[0]*100)>2:
        print("%d%%"%(entry[0]*100),entry[1],entry[2],'\n')

# Last expression of the cell: the notebook renders DC.
DC

In [None]:
# Use a larger 8x8 canvas for the centroid vector plot.
plt.rcParams["figure.figsize"] = [8,8]
# NOTE(review): the meaning of colors / Nlevels / logF / inv / minim is
# defined in cf.plotVectors — verify there.
cf.plotVectors(centers, dimensionNames = list(featuresDF.columns), colors = [], Nlevels = 10,
                logF = False, inv = True, minim = 0, verbose = True)

In [None]:
# Show floats with four decimals and a thousands separator.
pd.options.display.float_format = '{:,.4f}'.format

# Show each user (the index of featuresDF) next to its assigned cluster label.
mf.showTable(pd.DataFrame(list(featuresDF.index),columns = ['users']), pd.DataFrame(labels, columns = ['labels']))