In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output
import names

In [2]:
## Build Train Dataset #
# Synthesizes a labelled attendance table: one row per generated student name,
# random attendance counters, and a 0/1 `chronic_absentee` label.

# number of students
school_size = 600

# dedicated, seeded generator so the attendance numbers and the shuffle repeat
# on every run (the names come from the `names` package and are not seeded here)
train_rng = np.random.RandomState(0)

# features for ML (the last entry is the label column)
features = ['num_absent','num_tardy','elapsed_tardy','absent_freq','tardy_freq','chronic_absentee']

# generate names, alternating male/female; names are appended in pairs, so
# trim afterwards to end up with exactly `school_size` rows
names_list = []
for _ in range((school_size + 1) // 2):
    names_list.append(names.get_full_name(gender='male'))
    names_list.append(names.get_full_name(gender='female'))
names_list = names_list[:school_size]

# every column starts at 0; the attendance columns are overwritten below
data = 0
df = pd.DataFrame(data, index=names_list, columns=features)

# one random integer per row and column (upper bounds are exclusive)
n_rows = len(df)
df['num_absent'] = train_rng.randint(0, 30, size=n_rows)
df['num_tardy'] = train_rng.randint(0, 25, size=n_rows)
df['elapsed_tardy'] = train_rng.randint(0, 60, size=n_rows)
df['absent_freq'] = train_rng.randint(0, 5, size=n_rows)
df['tardy_freq'] = train_rng.randint(0, 5, size=n_rows)
df = df.sort_values('num_absent')

# generate absentees among students: after the ascending sort, the last 15% of
# rows (the most absences) are labelled 1.
# The index holds names (strings, and generated names may repeat), so the tail
# must be picked by position with .iloc; an integer slice through label-based
# .loc is not valid on a string index.
num_absentees = school_size*.15
num_regulars = int(school_size - num_absentees)
df.iloc[num_regulars:, df.columns.get_loc('chronic_absentee')] = 1

# shuffle rows so the labels are no longer ordered
df_train = df.sample(frac=1, random_state=train_rng)

# head(-5) returns every row except the last five; the notebook truncates the display
df_train.head(-5)

Unnamed: 0,num_absent,num_tardy,elapsed_tardy,absent_freq,tardy_freq,chronic_absentee
Robert Teran,5,1,35,4,2,0
Sandra Ulrich,8,22,32,4,3,0
Valerie Burrow,9,1,34,0,0,0
Edwin Madison,14,24,23,2,0,0
Barbara Hanavan,2,21,30,2,2,0
...,...,...,...,...,...,...
Justin Johnson,12,16,0,1,3,0
Casey Aguilar,14,6,37,1,0,0
Jody Chaffee,3,21,4,4,4,0
Virginia Spearman,24,14,26,1,3,0


In [3]:
## Build Test Dataset #
# Synthesizes an unlabelled attendance table with the same attendance columns
# as the train set; `chronic_absentee_risk` is a placeholder until scoring.

# number of students
school_size = 450

# dedicated, seeded generator (a different seed from the train cell so the two
# tables are not copies of each other); names themselves are not seeded here
test_rng = np.random.RandomState(1)

# features for ML (the last entry is the column that will hold the risk score)
features = ['num_absent','num_tardy','elapsed_tardy','absent_freq','tardy_freq','chronic_absentee_risk']

# generate names, alternating male/female; names are appended in pairs, so
# trim afterwards to end up with exactly `school_size` rows
names_list = []
for _ in range((school_size + 1) // 2):
    names_list.append(names.get_full_name(gender='male'))
    names_list.append(names.get_full_name(gender='female'))
names_list = names_list[:school_size]

# every column starts at 0; all of them are overwritten below
data = 0
df = pd.DataFrame(data, index=names_list, columns=features)

# one random integer per row and column (upper bounds are exclusive)
n_rows = len(df)
df['num_absent'] = test_rng.randint(0, 30, size=n_rows)
df['num_tardy'] = test_rng.randint(0, 25, size=n_rows)
df['elapsed_tardy'] = test_rng.randint(0, 60, size=n_rows)
df['absent_freq'] = test_rng.randint(0, 5, size=n_rows)
df['tardy_freq'] = test_rng.randint(0, 5, size=n_rows)

# set all to unknown: assign the whole column, so every row (including the
# last one) gets the placeholder and no integer slicing on the name index is needed
df['chronic_absentee_risk'] = 'unkwn'

# shuffle rows
df_test = df.sample(frac=1, random_state=test_rng)

# head(-5) returns every row except the last five; the notebook truncates the display
df_test.head(-5)

Unnamed: 0,num_absent,num_tardy,elapsed_tardy,absent_freq,tardy_freq,chronic_absentee_risk
James Lavoie,28,19,44,0,2,unkwn
Kelly Brown,8,19,53,3,2,unkwn
Zachary Jackson,16,11,19,4,1,unkwn
Anne Merle,9,16,2,1,4,unkwn
Janice Miller,17,15,12,1,0,unkwn
...,...,...,...,...,...,...
Mary Barney,7,7,5,3,4,unkwn
Shirley Taylor,15,4,46,2,4,unkwn
Theresa Goggins,14,4,4,1,4,unkwn
Margaret Johnson,18,5,5,0,4,unkwn


In [4]:
## Linear Regression #
# Fits ordinary least squares on the labelled train table, scores the test
# table, and stores the standardized score in `chronic_absentee_risk`.
from sklearn.linear_model import LinearRegression

# predictors: every entry of `features` except the trailing label/risk column
feature_cols = features[:-1]
X_train = df_train[feature_cols]

# labels
y_train = df_train['chronic_absentee']

# generate and fit model to historic data.
# The `normalize` argument is not passed: it was deprecated and then removed
# from scikit-learn, and for unregularized least squares the predictions are
# not affected by rescaling the inputs.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

# predict
X_test = df_test[feature_cols]
pred = model.predict(X=X_test)

# update dataframe w predictions (replaces the placeholder column)
df_test['chronic_absentee_risk'] = pred

# normalize: convert the raw predictions to z-scores (subtract the mean,
# divide by the sample standard deviation) so the column is a relative ranking
raw_risk = df_test['chronic_absentee_risk']
df_test['chronic_absentee_risk'] = (raw_risk - raw_risk.mean()) / raw_risk.std()

# ascending order: lowest risk first, highest risk last
df_test = df_test.sort_values('chronic_absentee_risk')

# head(-5) returns every row except the last five; the notebook truncates the display
df_test.head(-5)

Unnamed: 0,num_absent,num_tardy,elapsed_tardy,absent_freq,tardy_freq,chronic_absentee_risk
Jennifer Butler,0,16,2,0,3,-1.920270
Charles Maltba,0,24,11,2,1,-1.830724
Jacalyn Lord,0,5,3,1,4,-1.817031
Rodney Suiter,1,17,6,0,3,-1.797361
Tawana Harvey,0,19,20,2,2,-1.783903
...,...,...,...,...,...,...
Earl Blount,29,24,28,3,0,1.656644
Robert James,29,9,8,3,3,1.661993
William Forbes,28,18,51,4,4,1.662711
Mary Theiler,29,4,37,1,3,1.677076


In [5]:
# The five test-set students with the highest standardized risk score,
# highest first. nlargest(5, ...) already returns at most five rows, so the
# result is displayed directly.
risk_column = 'chronic_absentee_risk'
top_5 = df_test.nlargest(n=5, columns=[risk_column])

top_5

Unnamed: 0,num_absent,num_tardy,elapsed_tardy,absent_freq,tardy_freq,chronic_absentee_risk
Rhonda Lopez,29,7,43,4,0,1.83777
Frank Avery,29,0,52,1,0,1.758552
Henry Johnson,29,7,2,4,1,1.714221
John Reber,29,8,55,1,3,1.707517
Rafael Homes,29,3,24,2,1,1.703855
