In [1]:
# Imports
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
job = pd.read_csv("https://raw.githubusercontent.com/DG1606/CMS-R-2020/master/Placement_Data_Full_Class.csv")
job.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


Problem to solve: Want to develop a model that can predict the likelihood of a student being placed in a job based on their level of education. 

Individual Business Metric: Assuming that a higher level of education leads to a higher likelihood of job placement, can we predict whether students with different educational backgrounds will secure a job?

In [3]:
job.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sl_no           215 non-null    int64  
 1   gender          215 non-null    object 
 2   ssc_p           215 non-null    float64
 3   ssc_b           215 non-null    object 
 4   hsc_p           215 non-null    float64
 5   hsc_b           215 non-null    object 
 6   hsc_s           215 non-null    object 
 7   degree_p        215 non-null    float64
 8   degree_t        215 non-null    object 
 9   workex          215 non-null    object 
 10  etest_p         215 non-null    float64
 11  specialisation  215 non-null    object 
 12  mba_p           215 non-null    float64
 13  status          215 non-null    object 
 14  salary          148 non-null    float64
dtypes: float64(6), int64(1), object(8)
memory usage: 25.3+ KB


In [4]:
col_to_cat = [1,3,5, 6, 8, 9, 11, 13]
job.iloc[:,col_to_cat]= job.iloc[:,col_to_cat].astype('category') 

job.dtypes

  job.iloc[:,col_to_cat]= job.iloc[:,col_to_cat].astype('category')


sl_no                int64
gender            category
ssc_p              float64
ssc_b             category
hsc_p              float64
hsc_b             category
hsc_s             category
degree_p           float64
degree_t          category
workex            category
etest_p            float64
specialisation    category
mba_p              float64
status            category
salary             float64
dtype: object

In [5]:
continuous_vars = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p', 'salary']

scaler = MinMaxScaler()

job[continuous_vars] = scaler.fit_transform(job[continuous_vars])

job.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,0.53824,Others,0.889621,Others,Commerce,0.195122,Sci&Tech,No,0.104167,Mkt&HR,0.284483,Placed,0.094595
1,2,M,0.792414,Central,0.68089,Others,Science,0.670244,Sci&Tech,Yes,0.760417,Mkt&Fin,0.564843,Placed,0.0
2,3,M,0.497011,Central,0.510708,Central,Arts,0.341463,Comm&Mgmt,No,0.520833,Mkt&Fin,0.247001,Placed,0.067568
3,4,M,0.311482,Central,0.247117,Central,Science,0.04878,Sci&Tech,No,0.333333,Mkt&HR,0.308096,Not Placed,
4,5,M,0.925788,Central,0.602965,Central,Commerce,0.568293,Comm&Mgmt,No,0.975,Mkt&Fin,0.160795,Placed,0.304054


In [6]:
abc = list(job.select_dtypes('number')) 

job[abc] = MinMaxScaler().fit_transform(job[abc])
job 

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,0.000000,M,0.538240,Others,0.889621,Others,Commerce,0.195122,Sci&Tech,No,0.104167,Mkt&HR,0.284483,Placed,0.094595
1,0.004673,M,0.792414,Central,0.680890,Others,Science,0.670244,Sci&Tech,Yes,0.760417,Mkt&Fin,0.564843,Placed,0.000000
2,0.009346,M,0.497011,Central,0.510708,Central,Arts,0.341463,Comm&Mgmt,No,0.520833,Mkt&Fin,0.247001,Placed,0.067568
3,0.014019,M,0.311482,Central,0.247117,Central,Science,0.048780,Sci&Tech,No,0.333333,Mkt&HR,0.308096,Not Placed,
4,0.018692,M,0.925788,Central,0.602965,Central,Commerce,0.568293,Comm&Mgmt,No,0.975000,Mkt&Fin,0.160795,Placed,0.304054
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,0.981308,M,0.818594,Others,0.741351,Others,Commerce,0.673171,Comm&Mgmt,No,0.854167,Mkt&Fin,0.872564,Placed,0.270270
211,0.985981,M,0.352711,Others,0.378913,Others,Science,0.536585,Sci&Tech,No,0.500000,Mkt&Fin,0.090330,Placed,0.101351
212,0.990654,M,0.538240,Others,0.494234,Others,Commerce,0.560976,Comm&Mgmt,Yes,0.187500,Mkt&Fin,0.693778,Placed,0.128378
213,0.995327,F,0.682540,Others,0.477759,Others,Commerce,0.195122,Comm&Mgmt,No,0.416667,Mkt&HR,0.338081,Placed,0.005405


In [7]:
category_list = list(job.select_dtypes('category')) 

job_1h = pd.get_dummies(job, columns = category_list) 

job_1h

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary,gender_F,gender_M,ssc_b_Central,...,hsc_s_Science,degree_t_Comm&Mgmt,degree_t_Others,degree_t_Sci&Tech,workex_No,workex_Yes,specialisation_Mkt&Fin,specialisation_Mkt&HR,status_Not Placed,status_Placed
0,0.000000,0.538240,0.889621,0.195122,0.104167,0.284483,0.094595,0,1,0,...,0,0,0,1,1,0,0,1,0,1
1,0.004673,0.792414,0.680890,0.670244,0.760417,0.564843,0.000000,0,1,1,...,1,0,0,1,0,1,1,0,0,1
2,0.009346,0.497011,0.510708,0.341463,0.520833,0.247001,0.067568,0,1,1,...,0,1,0,0,1,0,1,0,0,1
3,0.014019,0.311482,0.247117,0.048780,0.333333,0.308096,,0,1,1,...,1,0,0,1,1,0,0,1,1,0
4,0.018692,0.925788,0.602965,0.568293,0.975000,0.160795,0.304054,0,1,1,...,0,1,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,0.981308,0.818594,0.741351,0.673171,0.854167,0.872564,0.270270,0,1,0,...,0,1,0,0,1,0,1,0,0,1
211,0.985981,0.352711,0.378913,0.536585,0.500000,0.090330,0.101351,0,1,0,...,1,0,0,1,1,0,1,0,0,1
212,0.990654,0.538240,0.494234,0.560976,0.187500,0.693778,0.128378,0,1,0,...,0,1,0,0,0,1,1,0,0,1
213,0.995327,0.682540,0.477759,0.195122,0.416667,0.338081,0.005405,1,0,0,...,0,1,0,0,1,0,0,1,0,1


In [8]:
job_dt = job_1h.drop(['sl_no'],axis=1) 
job_dt

Unnamed: 0,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary,gender_F,gender_M,ssc_b_Central,ssc_b_Others,...,hsc_s_Science,degree_t_Comm&Mgmt,degree_t_Others,degree_t_Sci&Tech,workex_No,workex_Yes,specialisation_Mkt&Fin,specialisation_Mkt&HR,status_Not Placed,status_Placed
0,0.538240,0.889621,0.195122,0.104167,0.284483,0.094595,0,1,0,1,...,0,0,0,1,1,0,0,1,0,1
1,0.792414,0.680890,0.670244,0.760417,0.564843,0.000000,0,1,1,0,...,1,0,0,1,0,1,1,0,0,1
2,0.497011,0.510708,0.341463,0.520833,0.247001,0.067568,0,1,1,0,...,0,1,0,0,1,0,1,0,0,1
3,0.311482,0.247117,0.048780,0.333333,0.308096,,0,1,1,0,...,1,0,0,1,1,0,0,1,1,0
4,0.925788,0.602965,0.568293,0.975000,0.160795,0.304054,0,1,1,0,...,0,1,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,0.818594,0.741351,0.673171,0.854167,0.872564,0.270270,0,1,0,1,...,0,1,0,0,1,0,1,0,0,1
211,0.352711,0.378913,0.536585,0.500000,0.090330,0.101351,0,1,0,1,...,1,0,0,1,1,0,1,0,0,1
212,0.538240,0.494234,0.560976,0.187500,0.693778,0.128378,0,1,0,1,...,0,1,0,0,0,1,1,0,0,1
213,0.682540,0.477759,0.195122,0.416667,0.338081,0.005405,1,0,0,1,...,0,1,0,0,1,0,0,1,0,1


In [10]:
prevalence = job_1h.status_Placed.value_counts()[1]/len(job_1h.status_Placed)
prevalence

0.6883720930232559

In [14]:
print(job_1h.status_Placed.value_counts())
print(148/(67+148)) 

1    148
0     67
Name: status_Placed, dtype: int64
0.6883720930232559


In [56]:
X = job_1h.drop(columns=['status_Placed', 'status_Not Placed']) 
y = job_1h['status_Placed']  

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, stratify=y)

X_tune, X_test, y_tune, y_test = train_test_split(X_test, y_test, train_size=0.5, stratify=y_test)

print(f"Train set: {X_train.shape[0]} rows")
print(f"Tune set: {X_tune.shape[0]} rows")
print(f"Test set: {X_test.shape[0]} rows")

print(f"Train set prevalence: {y_train.mean()}")
print(f"Tune set prevalence: {y_tune.mean()}")
print(f"Test set prevalence: {y_test.mean()}")

Train set: 172 rows
Tune set: 21 rows
Test set: 22 rows
Train set prevalence: 0.686046511627907
Tune set prevalence: 0.7142857142857143
Test set prevalence: 0.6818181818181818


Step three: What do your instincts tell you about the data. Can it address your problem, what areas/items are you worried about? 

My instincts tell me that the data likely reflects a positive correlation between education level and job placement success, with the percentage of job placement success increasing as the the education level increases. However, in terms of predictive power, I am worried that the dataset is too small to be able to accurately predict job placement based on education level, and may lead to over-fitting. There is also a possibility of bias being introduced because the degree categories are vague and not fully representative, the model might learn patterns specific to the industries represented, which will fail to generalize well to students from different fields. 

In [19]:
from io import StringIO
import requests

url="https://query.data.world/s/ttvvwduzk3hwuahxgxe54jgfyjaiul"
s=requests.get(url).text
c=pd.read_csv(StringIO(s))
c.head()

Unnamed: 0,unitid,chronname,city,state,level,control,basic,hbcu,flagship,long_x,...,vsa_grad_after6_transfer,vsa_grad_elsewhere_after6_transfer,vsa_enroll_after6_transfer,vsa_enroll_elsewhere_after6_transfer,similar,state_sector_ct,carnegie_ct,counted_pct,nicknames,cohort_size
0,100654,Alabama A&M University,Normal,Alabama,4-year,Public,Masters Colleges and Universities--larger prog...,X,,-86.568502,...,36.4,5.6,17.2,11.1,232937|100724|405997|113607|139533|144005|2285...,13,386,99.7|07,,882.0
1,100663,University of Alabama at Birmingham,Birmingham,Alabama,4-year,Public,Research Universities--very high research acti...,,,-86.80917,...,,,,,196060|180461|201885|145600|209542|236939|1268...,13,106,56.0|07,UAB,1376.0
2,100690,Amridge University,Montgomery,Alabama,4-year,Private not-for-profit,Baccalaureate Colleges--Arts & Sciences,,,-86.17401,...,,,,,217925|441511|205124|247825|197647|221856|1353...,16,252,100.0|07,,3.0
3,100706,University of Alabama at Huntsville,Huntsville,Alabama,4-year,Public,Research Universities--very high research acti...,,,-86.63842,...,0.0,0.0,0.0,0.0,232186|133881|196103|196413|207388|171128|1900...,13,106,43.1|07,UAH,759.0
4,100724,Alabama State University,Montgomery,Alabama,4-year,Public,Masters Colleges and Universities--larger prog...,X,,-86.295677,...,,,,,100654|232937|242617|243197|144005|241739|2354...,13,386,88.0|07,ASU,1351.0


Problem to solve: Want to develop a model that can predict whether an institution is public vs. private based on relevant factors.

Independent Business Metric: Assuming that the type of control (public or private) influences the financial success of institutions, can we predict public vs. private institutions based on financial performance?

In [20]:
c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3798 entries, 0 to 3797
Data columns (total 62 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   unitid                                3798 non-null   int64  
 1   chronname                             3798 non-null   object 
 2   city                                  3798 non-null   object 
 3   state                                 3798 non-null   object 
 4   level                                 3798 non-null   object 
 5   control                               3798 non-null   object 
 6   basic                                 3798 non-null   object 
 7   hbcu                                  94 non-null     object 
 8   flagship                              50 non-null     object 
 9   long_x                                3798 non-null   float64
 10  lat_y                                 3798 non-null   float64
 11  site             

In [33]:
cols_to_cat = [2, 3, 4, 5, 6, 7, 8, 9, 12]

c.iloc[:, cols_to_cat] = c.iloc[:, cols_to_cat].astype('category')

print(c.dtypes)


unitid              float64
chronname          category
city               category
state              category
level              category
                     ...   
state_sector_ct     float64
carnegie_ct         float64
counted_pct          object
nicknames            object
cohort_size         float64
Length: 62, dtype: object


  c.iloc[:, cols_to_cat] = c.iloc[:, cols_to_cat].astype('category')


In [54]:
abc = list(c.select_dtypes('number'))  

c[abc] = MinMaxScaler().fit_transform(c[abc])
c

Unnamed: 0,unitid,chronname,city,state,level,control,basic,hbcu,flagship,long_x,...,vsa_grad_after6_transfer,vsa_grad_elsewhere_after6_transfer,vsa_enroll_after6_transfer,vsa_enroll_elsewhere_after6_transfer,similar,state_sector_ct,carnegie_ct,counted_pct,nicknames,cohort_size
0,0.000000,Alabama A&M University,Normal,Alabama,4-year,Public,Masters Colleges and Universities--larger prog...,X,,-86.568502,...,0.407615,0.356098,1.0,0.521127,232937|100724|405997|113607|139533|144005|2285...,0.104348,0.746124,99.7|07,,0.054289
1,0.000024,University of Alabama at Birmingham,Birmingham,Alabama,4-year,Public,Research Universities--very high research acti...,,,-86.809170,...,,,,,196060|180461|201885|145600|209542|236939|1268...,0.104348,0.203488,56.0|07,UAB,0.084730
2,0.000096,Amridge University,Montgomery,Alabama,4-year,Private not-for-profit,Baccalaureate Colleges--Arts & Sciences,,,-86.174010,...,,,,,217925|441511|205124|247825|197647|221856|1353...,0.130435,0.486434,100.0|07,,0.000123
3,0.000139,University of Alabama at Huntsville,Huntsville,Alabama,4-year,Public,Research Universities--very high research acti...,,,-86.638420,...,0.000000,0.219512,0.0,0.000000,232186|133881|196103|196413|207388|171128|1900...,0.104348,0.203488,43.1|07,UAH,0.046709
4,0.000187,Alabama State University,Montgomery,Alabama,4-year,Public,Masters Colleges and Universities--larger prog...,X,,-86.295677,...,,,,,100654|232937|242617|243197|144005|241739|2354...,0.104348,0.746124,88.0|07,ASU,0.083190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3793,0.963263,Grace College of Divinity,Fayetteville,North Carolina,4-year,Private not-for-profit,Not applicable- not in Carnegie universe,,,-78.961691,...,,,,,,0.365217,0.155039,,,0.000678
3794,0.965468,John Paul the Great Catholic University,Escondido,California,4-year,Private not-for-profit,Not applicable- not in Carnegie universe,,,-117.082401,...,,,,,,0.634783,0.155039,,,0.001972
3795,0.977658,Chamberlain College of Nursing-Missouri,St. Louis,Missouri,4-year,Private for-profit,Not applicable- not in Carnegie universe,,,-90.436221,...,,,,,,0.165217,0.155039,,,0.000431
3796,0.998906,Minneapolis Media Institute,Edina,Minnesota,2-year,Private for-profit,Not applicable- not in Carnegie universe,,,-93.331491,...,,,,,,0.034783,0.155039,,,0.008874


In [37]:
category_list = list(c.select_dtypes('category')) 

c_1h = pd.get_dummies(c, columns = category_list) 

c_1h

Unnamed: 0,unitid,lat_y,awards_per_state_value,awards_per_natl_value,exp_award_value,exp_award_state_value,exp_award_natl_value,exp_award_percentile,ft_pct,fte_value,...,awards_per_value_75.5,awards_per_value_76.4,awards_per_value_76.8,awards_per_value_77.0,awards_per_value_77.5,awards_per_value_80.4,awards_per_value_90.8,awards_per_value_128.7,awards_per_value_131.1,awards_per_value_137.6
0,0.000000,0.292177,0.275132,0.306748,0.019941,0.359141,0.541284,0.90,0.935551,0.030646,...,0,0,0,0,0,0,0,0,0,0
1,0.000024,0.267361,0.275132,0.306748,0.025851,0.359141,0.541284,0.97,0.716216,0.079120,...,0,0,0,0,0,0,0,0,0,0
2,0.000096,0.245286,0.257496,0.368098,0.011059,0.452754,1.000000,0.30,0.612266,0.002065,...,0,0,0,0,0,0,0,0,0,0
3,0.000139,0.291004,0.275132,0.306748,0.012196,0.359141,0.541284,0.61,0.733888,0.039303,...,0,0,0,0,0,0,0,0,0,0
4,0.000187,0.245319,0.275132,0.306748,0.025067,0.359141,0.541284,0.96,0.906445,0.039580,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3793,0.963263,0.297512,0.320988,0.368098,0.004553,0.770853,1.000000,0.03,0.319127,0.000277,...,0,0,0,0,0,0,0,0,0,0
3794,0.965468,0.259974,0.338624,0.368098,0.020061,0.643289,1.000000,0.72,0.917879,0.000870,...,0,0,0,0,0,0,0,0,0,0
3795,0.977658,0.368084,0.507937,0.496933,0.010022,0.087195,0.181568,0.63,0.553015,0.003759,...,0,0,0,0,0,0,0,0,0,0
3796,0.998906,0.487486,0.613757,1.000000,0.014172,0.081649,0.000000,0.95,0.689189,0.001234,...,0,0,0,0,0,0,0,0,0,0


In [39]:
c_dt = c_1h.drop(['unitid', 'lat_y'],axis=1) 
c_dt

Unnamed: 0,awards_per_state_value,awards_per_natl_value,exp_award_value,exp_award_state_value,exp_award_natl_value,exp_award_percentile,ft_pct,fte_value,fte_percentile,med_sat_value,...,awards_per_value_75.5,awards_per_value_76.4,awards_per_value_76.8,awards_per_value_77.0,awards_per_value_77.5,awards_per_value_80.4,awards_per_value_90.8,awards_per_value_128.7,awards_per_value_131.1,awards_per_value_137.6
0,0.275132,0.306748,0.019941,0.359141,0.541284,0.90,0.935551,0.030646,0.33,0.180876,...,0,0,0,0,0,0,0,0,0,0
1,0.275132,0.306748,0.025851,0.359141,0.541284,0.97,0.716216,0.079120,0.67,0.552995,...,0,0,0,0,0,0,0,0,0,0
2,0.257496,0.368098,0.011059,0.452754,1.000000,0.30,0.612266,0.002065,0.12,,...,0,0,0,0,0,0,0,0,0,0
3,0.275132,0.306748,0.012196,0.359141,0.541284,0.61,0.733888,0.039303,0.40,0.592166,...,0,0,0,0,0,0,0,0,0,0
4,0.275132,0.306748,0.025067,0.359141,0.541284,0.96,0.906445,0.039580,0.41,0.188940,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3793,0.320988,0.368098,0.004553,0.770853,1.000000,0.03,0.319127,0.000277,0.01,,...,0,0,0,0,0,0,0,0,0,0
3794,0.338624,0.368098,0.020061,0.643289,1.000000,0.72,0.917879,0.000870,0.04,0.464286,...,0,0,0,0,0,0,0,0,0,0
3795,0.507937,0.496933,0.010022,0.087195,0.181568,0.63,0.553015,0.003759,0.40,,...,0,0,0,0,0,0,0,0,0,0
3796,0.613757,1.000000,0.014172,0.081649,0.000000,0.95,0.689189,0.001234,0.13,,...,0,0,0,0,0,0,0,0,0,0


In [46]:
prevalence = c_1h.control_Public.value_counts()[1]/len(c_1h.control_Public)
prevalence

0.41021590310689837

In [48]:
print(c_1h.control_Public.value_counts())
print(1558/(1558+2240)) 

0    2240
1    1558
Name: control_Public, dtype: int64
0.41021590310689837


In [55]:
X = c_1h.drop(columns=['control_Public', 'control_Private for-profit']) 
y = c_1h['control_Public'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, stratify=y)

X_tune, X_test, y_tune, y_test = train_test_split(X_test, y_test, train_size=0.5, stratify=y_test)

print(f"Train set: {X_train.shape[0]} rows")
print(f"Tune set: {X_tune.shape[0]} rows")
print(f"Test set: {X_test.shape[0]} rows")

print(f"Train set prevalence: {y_train.mean()}")
print(f"Tune set prevalence: {y_tune.mean()}")
print(f"Test set prevalence: {y_test.mean()}")

Train set: 3038 rows
Tune set: 380 rows
Test set: 380 rows
Train set prevalence: 0.41013824884792627
Tune set prevalence: 0.4105263157894737
Test set prevalence: 0.4105263157894737


Step three: What do your instincts tell you about the data. Can it address your problem, what areas/items are you worried about? 

The data encompasses several different features that could likely predict whether an institution is public or private based on financial performance. However because of the surplus of variables, there is potential for confounding factors to come into play such as geographical location, and other non-financial aspects that could also influence the type of control. If these factors are not included in the model, it might not be able to fully capture the reasons behind an institution's classification as public or private, which could lead to over-fitting and a false correlation between financial performance and the type of university. 


Pipeline function applicable to both dataset examples:

In [None]:
def clean_data(df, continuous_columns, categorical_columns, target_column):
    df[categorical_columns] = df[categorical_columns].apply(lambda x: x.astype('category')) #changes the type of variable to category

    scaler = MinMaxScaler()
    df[continuous_columns] = scaler.fit_transform(df[continuous_columns]) #scales continuous variables

    df_1h = pd.get_dummies(df, columns=categorical_columns) #one-hot encoding for categorical variables

    X = df_1h.drop(columns=[target_column]) #identifies the feature and target variable/s
    y = df_1h[target_column]
    #train/test split for the model
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, stratify=y)
    X_tune, X_test, y_tune, y_test = train_test_split(X_test, y_test, train_size=0.5, stratify=y_test)

    return X_train, X_test, X_tune, y_train, y_test, y_tune