In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

# `%matplotlib inline` (above) renders the graphs inside the notebook; without it they may open in a separate pop-up window

In [None]:
# Read the student dataset; expects 'student.csv' in the current working directory
data = pd.read_csv('student.csv')
# Preview the first rows to sanity-check the load
display (data.head())

In [None]:
# (number of rows, number of columns) of the loaded frame
data.shape

In [None]:
# Column names, dtypes and non-null counts for every column
data.info()

# The "Non-Null Count" column reveals missing values: a count below the total number of rows means that column has NaNs

# Data Dictionary

Attribute Information ::

Attributes for both student-mat.csv (Math course) and student-por.csv (Portuguese language course) datasets: 
1.	school - student's school (binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira) 
2.	sex - student's sex (binary: 'F' - female or 'M' - male) 
3.	age - student's age (numeric: from 15 to 22) 
4.	address - student's home address type (binary: 'U' - urban or 'R' - rural) 
5.	famsize - family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3) 
6.	Pstatus - parent's cohabitation status (binary: 'T' - living together or 'A' - apart) 
7.	Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 - 5th to 9th grade, 3 - secondary education or 4 - higher education) 
8.	Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 - 5th to 9th grade, 3 - secondary education or 4 - higher education) 
9.	Mjob - mother's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other') 
10.	Fjob - father's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other') 
11.	reason - reason to choose this school (nominal: close to 'home', school 'reputation', 'course' preference or 'other') 
12.	guardian - student's guardian (nominal: 'mother', 'father' or 'other') 
13.	traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour) 
14.	studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours) 
15.	failures - number of past class failures (numeric: n if 1<=n<3, else 4) 
16.	schoolsup - extra educational support (binary: yes or no) 
17.	famsup - family educational support (binary: yes or no) 
18.	paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no) 
19.	activities - extra-curricular activities (binary: yes or no) 
20.	nursery - attended nursery school (binary: yes or no) 
21.	higher - wants to take higher education (binary: yes or no) 
22.	internet - Internet access at home (binary: yes or no) 
23.	romantic - with a romantic relationship (binary: yes or no) 
24.	famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent) 
25.	freetime - free time after school (numeric: from 1 - very low to 5 - very high) 
26.	goout - going out with friends (numeric: from 1 - very low to 5 - very high) 
27.	Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high) 
28.	Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high) 
29.	health - current health status (numeric: from 1 - very bad to 5 - very good) 
30.	absences - number of school absences (numeric: from 0 to 93) 

## isnull()
- The isnull() function is used to detect missing values for an array-like object.

- This function takes a scalar or array-like object and indicates whether values are missing (NaN in numeric arrays)

In [None]:
# Boolean frame with the same shape as `data`: True marks a missing value
data.isnull()

# isnull() only flags NaN/None; blank (empty-string) values are NOT reported as missing
# True = NaN = missing value

In [None]:
data.isnull().sum() # column-wise count of missing values (True counts as 1, False as 0)

## Deletion

In [None]:
# Listwise deletion: keep only the rows that have no missing value in any column.
data_del = data.dropna(axis="index", how="any")

In [None]:
# Shape of the frame before and after dropping incomplete rows.
print(f"Before:  {data.shape} After:  {data_del.shape}")

In [None]:
# Fraction of rows lost to listwise deletion, derived from the frames themselves
# (instead of hard-coded row counts) so it stays correct if the dataset changes.
rows_before = len(data)
rows_after = len(data_del)
round((rows_before - rows_after) / rows_before, 2)

**We will end up deleting 54% of data.**

**We will lose 54% of the data, which will hamper the analysis, since predictive analysis requires a large amount of data. Instead of deleting rows, we will impute the missing values.** 

## Imputation of Continuous Values

In [None]:
# Histogram with a KDE overlay to inspect the shape of the absences distribution.
sns.displot(data["absences"], kde=True);

#### Since the data is positively skewed (long right tail), we will use the median to impute the missing values, because the median is not pulled towards extreme values the way the mean is

In [None]:
# Median of the absences column; this is the value used to fill the missing entries below
data.absences.median()

In [None]:
# First rows of the column before imputation, for comparison with the filled result
data.absences.head()

- **The fillna() function is used to fill NA/NaN values using the specified method.**

In [None]:
# Impute missing absences with the column median.
# The result is assigned back to the column rather than calling
# `data.absences.fillna(..., inplace=True)`: that chained in-place call operates on an
# intermediate Series, which raises a FutureWarning in pandas 2.x and leaves `data`
# unchanged once Copy-on-Write is the default (pandas 3).
data["absences"] = data["absences"].fillna(data["absences"].median())
data["absences"].head()

In [None]:
data.absences.isnull().sum() # total number of missing values remaining in the absences column

## Imputation for Categorical Variables

In [None]:
# Preview the first 10 values of the categorical column before imputation
data.activities.head(10)

In [None]:
data.activities.mode() # mode() returns a Series because a column can have more than one mode

In [None]:
# Take the first entry of the mode Series as the single fill value
data.activities.mode()[0]

In [None]:
# Impute missing activities with the most frequent category (first mode).
# The result is assigned back to the column rather than calling
# `data.activities.fillna(..., inplace=True)`: that chained in-place call operates on an
# intermediate Series, which raises a FutureWarning in pandas 2.x and leaves `data`
# unchanged once Copy-on-Write is the default (pandas 3).
data["activities"] = data["activities"].fillna(data["activities"].mode()[0])
data["activities"].head(10)

# Q. Famrel - impute the missing values

# Outliers Detection

In [None]:
# Horizontal boxplot (values passed as x) to spot outliers in absences.
ax = sns.boxplot(x=data["absences"])
ax.set(title="Boxplot of Absences Column");

In [None]:
# Five-number summary of absences; Q1 and Q3 are reused by the IQR cells that follow.
absences_col = data['absences']
min_value, max_value = absences_col.min(), absences_col.max()
Q1, Q3 = absences_col.quantile(0.25), absences_col.quantile(0.75)
median_value = absences_col.median()

# One print call, one line per statistic (labels padded so the colons line up).
print(
    f"min_value    : {min_value}",
    f"Q1           : {Q1}",
    f"median_value : {median_value}",
    f"Q3           : {Q3}",
    f"max_value    : {max_value}",
    sep="\n",
)


**Not Outliers = (Q1 - 1.5 * IQR) <= data <= (Q3 + 1.5 * IQR)**
- Q1 - 1st Quartile
- Q3 - 3rd Quartile
- Q3-Q1 - IQR

In [None]:
# Interquartile range: distance between the 3rd and 1st quartiles
IQR = Q3 - Q1
IQR

In [None]:
# Outlier fences: anything outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] is treated as an outlier.
whisker = 1.5 * IQR
lower_limit = Q1 - whisker  # data points below this are outliers
upper_limit = Q3 + whisker  # data points above this are outliers

print(lower_limit, upper_limit, sep="\n")

## Deleting Outliers

In [None]:
# Rows whose absences fall below the lower limit (low-side outliers)
data.loc[data.absences < lower_limit]

In [None]:
# Rows whose absences exceed the upper limit (high-side outliers).
# Only the last expression of a cell is rendered automatically, so the frame is shown
# explicitly with display(); the filtered frame is computed once and reused for the count.
outliers_high = data.loc[data.absences > upper_limit]
display(outliers_high)
len(outliers_high)

In [None]:
# Percentage of rows flagged as high-side outliers, computed from the data
# (instead of hard-coded counts) so it stays correct if the dataset or limits change.
n_outliers = int((data.absences > upper_limit).sum())
n_outliers / len(data) * 100

In [None]:
# Index labels of the rows whose absences exceed the upper limit
data.loc[data.absences > upper_limit].index

In [None]:
# Keep only the rows at or below the upper limit, then report the new shape.
# NOTE: this rebinds `data`, so the unfiltered frame is no longer available afterwards.
within_upper = data.absences <= upper_limit
data = data.loc[within_upper]
data.shape

In [None]:
# Vertical boxplot (values passed as y) of absences after removing the high-side outliers.
ax = sns.boxplot(y=data["absences"])
ax.set(title="Boxplot of Absences Column");

In [None]:
# Five-number summary recomputed on the filtered data.
# NOTE: this overwrites min_value, Q1, median_value, Q3 and max_value from the earlier summary.
absences_col = data['absences']
min_value, max_value = absences_col.min(), absences_col.max()
Q1, Q3 = absences_col.quantile(0.25), absences_col.quantile(0.75)
median_value = absences_col.median()

# One print call, one line per statistic (labels padded so the colons line up).
print(
    f"min_value    : {min_value}",
    f"Q1           : {Q1}",
    f"median_value : {median_value}",
    f"Q3           : {Q3}",
    f"max_value    : {max_value}",
    sep="\n",
)
