In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output
import names

In [2]:
## Build Train Dataset #
# Simulates attendance records for one completed school year and labels the
# most-absent 15% of students as chronic absentees (the training target).

# Seed NumPy's global RNG so the simulated counts and the shuffle below are
# repeatable on a fresh "Restart & Run All".
# NOTE(review): this does not seed the `names` package, so the generated
# student names presumably still differ between runs - confirm if that matters.
np.random.seed(42)

# school params
school_size = 600
school_days_past = 180 # full school year

# features for ML
features = ['num_absent','num_tardy','elapsed_tardy','absent_ratio','tardy_ratio','chronic_absentee']

# generate names: one male and one female name per iteration,
# i.e. school_size // 2 pairs
names_list = []
for idx in range(school_size//2):
    names_list.append(names.get_full_name(gender='male'))
    names_list.append(names.get_full_name(gender='female'))

# every feature starts at 0; the simulated values below overwrite the columns
data = 0
df = pd.DataFrame(data, index = names_list, columns = features)

# simulation data
# 1-D draws (one value per student) instead of an (n, 1) matrix per column
n_students = len(df)
df['num_absent'] = np.random.randint(0, school_days_past//3, size=n_students)
df['num_tardy'] = np.random.randint(0, school_days_past//4, size=n_students)
df['absent_ratio'] = df['num_absent']/school_days_past
df['tardy_ratio'] = df['num_tardy']/school_days_past
df['elapsed_tardy'] = np.random.randint(0, 10, size=n_students)
df = df.sort_values('num_absent')

# generate absentees among students
num_absentees = school_size*.15
num_regulars = school_size - num_absentees
# Rows are sorted by ascending absences, so the chronic absentees are the
# LAST rows by position. The index holds student names (strings, possibly
# duplicated), so the slice must be positional: use iloc, not loc.
label_col = df.columns.get_loc('chronic_absentee')
df.iloc[int(num_regulars):, label_col] = 1

# shuffle rows
df_train = df.sample(frac=1)

# head(-1) returns every row except the last; the notebook display then
# truncates the middle of the frame
df_train.head(-1)

Unnamed: 0,num_absent,num_tardy,elapsed_tardy,absent_ratio,tardy_ratio,chronic_absentee
Jose Davis,10,3,5,0.055556,0.016667,0
Nancy Brunetti,25,26,8,0.138889,0.144444,0
Keith Akin,40,36,7,0.222222,0.200000,0
James Jackson,53,42,5,0.294444,0.233333,1
Emma Small,2,18,9,0.011111,0.100000,0
...,...,...,...,...,...,...
Tracey Adams,58,8,4,0.322222,0.044444,1
Diane Almonte,58,40,8,0.322222,0.222222,1
Michelle Shawe,40,8,9,0.222222,0.044444,0
Helga Holland,42,2,7,0.233333,0.011111,0


We've now created our training set. This set is representative of student data collected during the previous school year, where, in retrospect, chronic absentees are easily identified. A '1' means the student was flagged for chronic absenteeism and a '0' means the student was not considered chronically absent. We marked the 15% of students with the most absences as chronic absentees.

In [3]:
## Build Test Dataset #
# Simulates attendance records for the first 30 days of a new school year.
# The risk column is a placeholder here; it is meant to be filled in later
# with model predictions.

# Seed NumPy's global RNG so re-running this cell reproduces the same
# simulated counts and shuffle.
# NOTE(review): this does not seed the `names` package, so the generated
# student names presumably still differ between runs - confirm if that matters.
np.random.seed(43)

# school params
school_size = 450
school_days_past = 30

# features for ML
features = ['num_absent','num_tardy','elapsed_tardy','absent_ratio','tardy_ratio','chronic_absentee_risk']

# generate names: one male and one female name per iteration,
# i.e. school_size // 2 pairs
names_list = []
for idx in range(school_size//2):
    names_list.append(names.get_full_name(gender='male'))
    names_list.append(names.get_full_name(gender='female'))

# every feature starts at 0; the simulated values below overwrite the columns
data = 0
df = pd.DataFrame(data, index = names_list, columns = features)

# simulation data
# 1-D draws (one value per student) instead of an (n, 1) matrix per column
n_students = len(df)
df['num_absent'] = np.random.randint(0, school_days_past//3, size=n_students)
df['num_tardy'] = np.random.randint(0, school_days_past//4, size=n_students)
df['absent_ratio'] = df['num_absent']/school_days_past
df['tardy_ratio'] = df['num_tardy']/school_days_past
df['elapsed_tardy'] = np.random.randint(0, 10, size=n_students)

# set all to unknown
# Replace the whole column rather than writing strings into the existing
# integer column through .loc, which is an in-place dtype upcast that newer
# pandas versions warn about.
df['chronic_absentee_risk'] = 'unkwn'

# shuffle rows
df_test = df.sample(frac=1)

# head(-1) returns every row except the last; the notebook display then
# truncates the middle of the frame
df_test.head(-1)

Unnamed: 0,num_absent,num_tardy,elapsed_tardy,absent_ratio,tardy_ratio,chronic_absentee_risk
Jamie Krylo,3,3,0,0.100000,0.100000,unkwn
Vicky Bonilla,4,3,2,0.133333,0.100000,unkwn
Anthony Mcmanemy,0,2,6,0.000000,0.066667,unkwn
Mary Tucker,7,2,1,0.233333,0.066667,unkwn
Sergio Carrillo,8,0,3,0.266667,0.000000,unkwn
...,...,...,...,...,...,...
Talia Mclean,7,1,7,0.233333,0.033333,unkwn
Richard Wise,1,6,6,0.033333,0.200000,unkwn
Hermine Hensley,4,5,9,0.133333,0.166667,unkwn
Olivia Cutshall,1,6,2,0.033333,0.200000,unkwn


We now have our testing set for the ML algorithm to predict on. Here, we do not yet know which students are most at risk of chronic absenteeism. This set is representative of the data that would be available after the first 30 days of school. The device tracks this data throughout the year and makes new predictions each day as new attendance data is gathered.

In [4]:
## Linear Regression # Machine Learning Algorithm
# Fits a linear model on the labelled training year and scores every student
# in the test set with a relative risk value in [0, 1].
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# train set: every feature column except the last entry of `features`
# (the label / risk column)
X_train = df_train[features[:-1]]

# labels
y_train = df_train['chronic_absentee']

# generate and fit model to historic data
# The `normalize=True` argument of LinearRegression was deprecated and later
# removed from scikit-learn. Scaling the features in a pipeline step is the
# replacement recommended by the deprecation notice.
model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
model.fit(X=X_train, y=y_train)

# predict
X_test = df_test[features[:-1]]
pred = model.predict(X=X_test)

# make preds between 0 and 1 (min-max rescaling, so the values are relative
# rankings within this test set rather than probabilities)
pred_low = pred.min()
pred_spread = pred.max() - pred_low
if pred_spread > 0:
    pred = (pred - pred_low) / pred_spread
else:
    # all raw predictions are identical: avoid 0/0 and report zero risk spread
    pred = pred - pred_low

# update dataframe with predictions
df_test['chronic_absentee_risk'] = pred

# sort by risk, closer to 1 the higher the risk
df_test = df_test.sort_values('chronic_absentee_risk')

# head(-1) returns every row except the last; the notebook display then
# truncates the middle of the frame
df_test.head(-1)

Unnamed: 0,num_absent,num_tardy,elapsed_tardy,absent_ratio,tardy_ratio,chronic_absentee_risk
Calvin Bean,0,0,0,0.0,0.000000,0.000000
Margaret Doyle,0,1,0,0.0,0.033333,0.000437
David Rodgers,0,2,0,0.0,0.066667,0.000873
Douglas Moore,0,2,0,0.0,0.066667,0.000873
John Peterson,0,3,0,0.0,0.100000,0.001310
...,...,...,...,...,...,...
Suzanne Choe,9,0,8,0.3,0.000000,0.994556
Ramon Wolfe,9,3,8,0.3,0.100000,0.995866
Chung Millan,9,4,8,0.3,0.133333,0.996303
Roberto Burr,9,0,9,0.3,0.000000,0.997817


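Above, after the ML algorithm makes its predictions, all students are ordered by risk. Note the 'chronic_absentee_risk' column, which holds the predicted risk level: values lie between 0 and 1, and a higher value means more risk. Because the raw predictions are rescaled to this range, the values are relative rankings within this group of students rather than probabilities.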

In [5]:
# Pick out the five test-set students with the highest predicted risk.
# nlargest returns at most five rows, ordered from highest to lowest risk.
top_5 = df_test.nlargest(n=5, columns='chronic_absentee_risk')

top_5

Unnamed: 0,num_absent,num_tardy,elapsed_tardy,absent_ratio,tardy_ratio,chronic_absentee_risk
Martha Pak,9,5,9,0.3,0.166667,1.0
Minnie Puentes,9,2,9,0.3,0.066667,0.99869
Roberto Burr,9,0,9,0.3,0.0,0.997817
Chung Millan,9,4,8,0.3,0.133333,0.996303
Ramon Wolfe,9,3,8,0.3,0.1,0.995866


These are the top 5 students predicted by the ML algorithm to be at the highest risk for absenteeism. We can see from the values associated with these students that indeed they have missed many classes within the first 30 days of school, which is indicative of chronic absenteeism. 

With more robust data gathered by the device, we can discover more informative features beyond the total number of absences. Some other features we could use are tardy frequency and absent frequency. The way these occurrences cluster over time could be positively correlated with chronic absenteeism and allow even earlier detection.

This is simulated data and the highest absences were assigned the chronic absenteeism label. So, these results demonstrate that the model is performing well and as expected.

More data is needed, which the Vigil system is capable of gathering and processing in a real school environment. By identifying the students most at risk, the school can take action to improve each student's chance of academic success as well as positive life outcomes.