# ========================================

# Import Dependencies

# ========================================

In [1]:
##################################################################################################
#                            Miscellaneous Operating System Interfaces
##################################################################################################
import os
##################################################################################################
#                       Data Structure and Manipulation (Arrays & Dataframes)
##################################################################################################
import numpy as np
import pandas as pd
##################################################################################################
#                                     Data Visualization
##################################################################################################
import seaborn as sns
# from matplotlib import cm
import matplotlib.pyplot as plt
##################################################################################################
#                                    Statistics Libraries
##################################################################################################
import statistics
import scipy

# ========================================

# File Paths Used in the Notebook

# ========================================

In [2]:
# Format the Input File Path for the Appropriate Operating System
Resources_file_path = os.path.join("Resources", "Data")
Resources_file_path

'Resources/Data'

# ========================================

# Step 1 - Import Student Data

# ========================================

In [3]:
# CSV File Name
student_grade_prediction_csv_file_name = "Student_Grade_Prediction_Data.csv"

In [4]:
# Combine the File Path and the File Name for the Appropriate Operating System
student_grade_prediction_csv_file_path = os.path.join(Resources_file_path, student_grade_prediction_csv_file_name)
student_grade_prediction_csv_file_path

'Resources/Data/Student_Grade_Prediction_Data.csv'

In [5]:
# - Show All Columns/Rows of a Pandas Dataframe
#     - https://towardsdatascience.com/how-to-show-all-columns-rows-of-a-pandas-dataframe-c49d4507fcf
#     - https://pandas.pydata.org/pandas-docs/stable/user_guide/options.html

pd.set_option('display.max_columns', None)

In [6]:
# Read in the CSV File as a Dataframe
student_grade_prediction_df = pd.read_csv(student_grade_prediction_csv_file_path)

student_grade_prediction_df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,no,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,2,5,4,6,10,10


# ========================================

# Step 2 - Check Data for Nulls

# ========================================

In [None]:
# Check the Dataframe for Null Values
student_grade_prediction_df_check_nan = student_grade_prediction_df.isnull().values.any()

if student_grade_prediction_df_check_nan == True:
    print("The Dataframe has Null Values!!!!!")
#     student_grade_prediction_df_check_nan = student_grade_prediction_df['API'].isnull().sum()
#     print(f"There are {heart_df_count_nan} null values in the dataframe.")

# ========================================

# Step 3 - Evaluate the Column Headers and Their Data Types

# ========================================

In [None]:
# Column Headers

student_grade_prediction_df.keys()

In [None]:
# Column Headers and Their Data Types

# Reference:
#     - Convert a Series to a Dataframe
#         - https://pandas.pydata.org/docs/reference/api/pandas.Series.to_frame.html

student_grade_prediction_dtypes_df = student_grade_prediction_df.dtypes.to_frame().reset_index()
student_grade_prediction_dtypes_df = student_grade_prediction_dtypes_df.rename(columns={"index": "Column_Headers", 0: "dtype"})

print(type(student_grade_prediction_dtypes_df))
student_grade_prediction_dtypes_df

## Step 3.1 Column Header Details

school - binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira

sex - binary: 'F' - female or 'M' - male

age - numeric: from 15 to 22

address - student's home address type (binary: 'U' - urban or 'R' - rural)

famsize - binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3

Pstatus - parent's cohabitation status (binary: 'T' - living together or 'A' - apart)

Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 â€“ 5th to 9th grade, 3 â€“ secondary

Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 â€“ 5th to 9th grade, 3 â€“ secondary

Mjob - mother's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or

Fjob - father's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or

reason - reason to choose this school (nominal: close to 'home', school 'reputation', 'course' preference or 'other')

guardian - student's guardian (nominal: 'mother', 'father' or 'other')

traveltime - home to school travel time (numeric: 1 - &lt;15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - &gt;1 hour)

studytime - weekly study time (numeric: 1 - &lt;2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - &gt;10 hours)

failures - number of past class failures (numeric: n if 1&lt;=n&lt;3, else 4)

schoolsup - extra educational support (binary: yes or no) 

famsup - family educational support (binary: yes or no)

paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)

activities - extra-curricular activities (binary: yes or no)

nursery - attended nursery school (binary: yes or no)

higher - wants to take higher education (binary: yes or no)

internet - Internet access at home (binary: yes or no)

romantic - with a romantic relationship (binary: yes or no)

famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)

freetime - free time after school (numeric: from 1 - very low to 5 - very high)

goout - going out with friends (numeric: from 1 - very low to 5 - very high)

Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)

Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)

health - current health status (numeric: from 1 - very bad to 5 - very good)

absences - number of school absences (numeric: from 0 to 93)

G1 - first period grade (numeric: from 0 to 20)

G2 - second period grade (numeric: from 0 to 20)

G3 - final grade (numeric: from 0 to 20)

# ========================================

# Step 4 - Check the Statistics of the Data

# ========================================

# Step 4.1 - Intial Statistics Using Pandas Describe Function

In [None]:
# Reference:
#     - Pandas Describe
#         - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.describe.html

# percentile list
perc =[.20, .40, .60, .80]

# list of dtypes to include
include =['object', 'float', 'int']

student_grade_prediction_df_dscrb = student_grade_prediction_df.describe(percentiles = perc, include = include)

student_grade_prediction_df_dscrb

# Step 4.2 - Intial Statistics Using Pandas Describe Function

In [None]:
student_grade_prediction_df.value_counts(student_grade_prediction_df.age)

# Step 4.1.1 - Observations from the intial Statistics

In [None]:


for dtype_indx in student_grade_prediction_dtypes_df.index:
#     print(dtype_indx)

    if student_grade_prediction_dtypes_df["dtype"][dtype_indx] == "int64":
#         print(student_grade_prediction_dtypes_df["Column_Headers"][dtype_indx])
#         print(student_grade_prediction_dtypes_df["dtype"][dtype_indx])

        clmn_hdrs_nm = student_grade_prediction_dtypes_df["Column_Headers"][dtype_indx]

        print(f"{clmn_hdrs_nm} Min: {min(student_grade_prediction_df[clmn_hdrs_nm])}")
        print(f"{clmn_hdrs_nm} Max: {max(student_grade_prediction_df[clmn_hdrs_nm])}")
        print(f"{clmn_hdrs_nm} Mean: {statistics.mean(student_grade_prediction_df[clmn_hdrs_nm])}")
        print(f"{clmn_hdrs_nm} Median: {statistics.median(student_grade_prediction_df[clmn_hdrs_nm])}")
        print(f"{clmn_hdrs_nm} Mode: {statistics.mode(student_grade_prediction_df[clmn_hdrs_nm])}")
        print(f"{clmn_hdrs_nm} Variance: {statistics.variance(student_grade_prediction_df[clmn_hdrs_nm])}")
        
        zscore_ndarray =  scipy.stats.zscore(student_grade_prediction_df[clmn_hdrs_nm])
        zscore_df = pd.DataFrame(zscore_ndarray)
#         print(f"{clmn_hdrs_nm} Z-Score: {type(scipy.stats.zscore(student_grade_prediction_df[clmn_hdrs_nm]))}")

#         ax = student_grade_prediction_df.plot(x = clmn_hdrs_nm, y = ""

        print("----------------------------------------------------")

In [None]:
zscore_df.groupby([0]).count()

In [None]:
print(f"Age Median: {statistics.median(student_grade_prediction_df['age'])}")
print(f"Age Mode: {statistics.mode(student_grade_prediction_df['age'])}")
print("----------------------------------------------------")
print(f"Age Median: {statistics.median(student_grade_prediction_df['age'])}")
print(f"Age Mode: {statistics.mode(student_grade_prediction_df['age'])}")
print("----------------------------------------------------")
print(f"Age Median: {statistics.median(student_grade_prediction_df['age'])}")
print(f"Age Mode: {statistics.mode(student_grade_prediction_df['age'])}")
print("----------------------------------------------------")
print(f"Age Median: {statistics.median(student_grade_prediction_df['age'])}")
print(f"Age Mode: {statistics.mode(student_grade_prediction_df['age'])}")
print("----------------------------------------------------")

# ========================================

# Step 3 - Manipulate the Data for Plots

# ========================================

# ========================================

# Step 3 - Plots the Data

# ========================================

## Step 3.1 - Quick Scatter Plots and Histograms 

In [None]:
# Reference: 
#     - Seaborn.pairplot()
#         - https://seaborn.pydata.org/generated/seaborn.pairplot.html

sns.pairplot(data = student_grade_prediction_df)

## Step 3.2 - Boxplots

### Step 3.2.1 - Seaborn Boxplots

In [None]:
# References:
#     - Seaborn Boxplot
#         - https://seaborn.pydata.org/generated/seaborn.boxplot.html
#     - Use swarmplot() to show datapoints on top of the boxes
#         - https://seaborn.pydata.org/generated/seaborn.swarmplot.html#seaborn.swarmplot


for indx in student_grade_prediction_dtypes_df.index:
#     print(indx)
    if student_grade_prediction_dtypes_df["dtype"][indx] == "int64":
        clmn_hdr = student_grade_prediction_dtypes_df['Column_Headers'][indx]
        
        print(f"{clmn_hdr}")


        sns.set(style="whitegrid")
        tips = sns.load_dataset("tips")

        ax = sns.boxplot(data=student_grade_prediction_df[clmn_hdr], 
                         showfliers = False, 
                         showmeans = True)

        ax = sns.swarmplot(data=student_grade_prediction_df[clmn_hdr], 
                           color=".25")

        plt.show()

### Step 3.2.2 - Matplotlib Boxplots

In [None]:
for indx in student_grade_prediction_dtypes_df.index:
#     print(indx)
    if student_grade_prediction_dtypes_df["dtype"][indx] == "int64":
        clmn_hdr = student_grade_prediction_dtypes_df['Column_Headers'][indx]
        
        print(f"{clmn_hdr}")

        plt.figure(figsize = (10,5))
        plt.title(clmn_hdr)
        plt.ylabel(clmn_hdr)
        plt.boxplot(student_grade_prediction_df[clmn_hdr], showmeans = True)
        plt.show()

## Step 3.3 - Histograms

In [None]:
# References:
#     - Setting Plot Background Color in Seaborn
#         - https://stackoverflow.com/questions/25238442/setting-plot-background-colour-in-seaborn
#     - Title a Seaborn Plot 
#         - https://www.statology.org/seaborn-title/
#     - numpy.mean()
#         - https://numpy.org/doc/stable/reference/generated/numpy.mean.html
#     - Add Vertical Lines to a Distribution Plot (sns.distplot) in Matplotlib
#         - https://www.tutorialspoint.com/how-to-add-vertical-lines-to-a-distribution-plot-sns-distplot-in-matplotlib


for indx in student_grade_prediction_dtypes_df.index:
#     print(indx)
    if student_grade_prediction_dtypes_df["dtype"][indx] == "int64":
        clmn_hdr = student_grade_prediction_dtypes_df['Column_Headers'][indx]

        clmn_hdr_data = student_grade_prediction_df[clmn_hdr]
        
        sns.set_style("ticks")
#         sns.set(rc={'axes.facecolor':'black', 'figure.facecolor':'white'})

#         plt.plot([2.8, 2.8], [0, max(student_grade_prediction_df[clmn_hdr])])
        sns.histplot(data = clmn_hdr_data, 
                     facecolor='Blue').set(title=clmn_hdr)

        plt.axvline(statistics.mean(clmn_hdr_data), color='red')
        plt.axvline(statistics.median(clmn_hdr_data), color='yellow')
        plt.axvline(statistics.mode(clmn_hdr_data), color='orange')
        
    
#         ax.set_facecolor("g")
        plt.show()

In [None]:
heart_seaborn = sns.load_dataset("tips")
heart_seaborn.head()

sns.pairplot(data = heart_seaborn, diag_kind = "kde", kind = "reg")

In [None]:
sns.set_style("darkgrid")

# Create the Default Pairplot
sns.pairplot(data = heart_df, hue = "sex", kind = "reg", vars = ["age", "thalachh"])
plt.show()

# ========================================

# Step ? - Create Tables for a Database

# ========================================

## Step ?.1 - Unique Values from Each Column 

In [34]:
empty_dic = {}
# result = pd.DataFrame()

for df_header in student_grade_prediction_df.head():
    print(df_header)
    
#     empty_dic[df_header] = pd.DataFrame(columns = ["values"])
#     print(empty_dic[df_header])
    
    clmn_unqe_rslt = student_grade_prediction_df[df_header].unique()
    print(type(clmn_unqe_rslt))
    clmn_unqe_rslt_tolst = clmn_unqe_rslt.tolist()
    print(type(clmn_unqe_rslt_tolst))
    clmn_unqe_rslt_tolst_df = pd.DataFrame(clmn_unqe_rslt_tolst)
    print(type(clmn_unqe_rslt_tolst_df))
    
    empty_dic[df_header] = clmn_unqe_rslt_tolst_df
    
#     a_series = pd.Series(clmn_unqe_rslt)
#     print(type(clmn_unqe_rslt))
#     print(clmn_unqe_rslt_tolst)
    
#     empty_dic[df_header].append(a_series, ignore_index = True)
#     print(empty_dic[df_header])
    print("----------------------------------------------------")

school
<class 'numpy.ndarray'>
<class 'list'>
<class 'pandas.core.frame.DataFrame'>
----------------------------------------------------
sex
<class 'numpy.ndarray'>
<class 'list'>
<class 'pandas.core.frame.DataFrame'>
----------------------------------------------------
age
<class 'numpy.ndarray'>
<class 'list'>
<class 'pandas.core.frame.DataFrame'>
----------------------------------------------------
address
<class 'numpy.ndarray'>
<class 'list'>
<class 'pandas.core.frame.DataFrame'>
----------------------------------------------------
famsize
<class 'numpy.ndarray'>
<class 'list'>
<class 'pandas.core.frame.DataFrame'>
----------------------------------------------------
Pstatus
<class 'numpy.ndarray'>
<class 'list'>
<class 'pandas.core.frame.DataFrame'>
----------------------------------------------------
Medu
<class 'numpy.ndarray'>
<class 'list'>
<class 'pandas.core.frame.DataFrame'>
----------------------------------------------------
Fedu
<class 'numpy.ndarray'>
<class 'list'>
<c

In [36]:
empty_dic

{'school':     0
 0  GP
 1  MS,
 'sex':    0
 0  F
 1  M,
 'age':     0
 0  18
 1  17
 2  15
 3  16
 4  19
 5  22
 6  20
 7  21,
 'address':    0
 0  U
 1  R,
 'famsize':      0
 0  GT3
 1  LE3,
 'Pstatus':    0
 0  A
 1  T,
 'Medu':    0
 0  4
 1  1
 2  3
 3  2
 4  0,
 'Fedu':    0
 0  4
 1  1
 2  2
 3  3
 4  0,
 'Mjob':           0
 0   at_home
 1    health
 2     other
 3  services
 4   teacher,
 'Fjob':           0
 0   teacher
 1     other
 2  services
 3    health
 4   at_home,
 'reason':             0
 0      course
 1       other
 2        home
 3  reputation,
 'guardian':         0
 0  mother
 1  father
 2   other,
 'traveltime':    0
 0  2
 1  1
 2  3
 3  4,
 'studytime':    0
 0  2
 1  3
 2  1
 3  4,
 'failures':    0
 0  0
 1  3
 2  2
 3  1,
 'schoolsup':      0
 0  yes
 1   no,
 'famsup':      0
 0   no
 1  yes,
 'paid':      0
 0   no
 1  yes,
 'activities':      0
 0   no
 1  yes,
 'nursery':      0
 0  yes
 1   no,
 'higher':      0
 0  yes
 1   no,
 'internet':      0


In [35]:
print(empty_dic["school"])
print(type(empty_dic["school"]))

# school_df = pd.DataFrame(empty_dic["school"])
# school_df

    0
0  GP
1  MS
<class 'pandas.core.frame.DataFrame'>


# ========================================

# Step ? - Create Tables for a Database

# ========================================

## Step ?.1 - Unique Values from Each Column 