# ========================================

# Import Dependencies

# ========================================

In [1]:
##################################################################################################
#                            Miscellaneous Operating System Interfaces
##################################################################################################
import os
##################################################################################################
#                       Data Structure and Manipulation (Arrays & Dataframes)
##################################################################################################
import numpy as np
import pandas as pd
##################################################################################################
#                                     Data Visualization
##################################################################################################
import seaborn as sns
import matplotlib.pyplot as plt
##################################################################################################
#                                    Statistics Libraries
##################################################################################################
import statistics
import scipy
import scipy.stats

# ========================================

# File Paths Used in the Notebook

# ========================================

In [2]:
# Relative location of the input data, built from its folder names so that
# os.path.join inserts the separator of the host operating system
resources_dir_parts = ("Resources", "Data")
Resources_file_path = os.path.join(*resources_dir_parts)
Resources_file_path

'Resources/Data'

# ========================================

# Step 1 - Import Student Data

# ========================================

In [3]:
# CSV File Name
# File name only (no directory); it is combined with Resources_file_path in the next cell
student_grade_prediction_csv_file_name = "Student_Grade_Prediction_Data.csv"

In [4]:
# Full path to the CSV: resources directory + file name, joined with the
# separator of the host operating system
student_grade_prediction_csv_file_path = os.path.join(
    Resources_file_path,
    student_grade_prediction_csv_file_name,
)
student_grade_prediction_csv_file_path

'Resources/Data/Student_Grade_Prediction_Data.csv'

In [5]:
# Load the student records from the CSV file into a dataframe
student_grade_prediction_df = pd.read_csv(
    filepath_or_buffer = student_grade_prediction_csv_file_path
)

# Preview the first five rows
student_grade_prediction_df.head(5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


# ========================================

# Step 2 - Check Data for Nulls

# ========================================

In [6]:
# Check the Dataframe for Null Values

# Boolean mask with the same shape as the dataframe: True wherever a value is missing
student_grade_prediction_null_mask_df = student_grade_prediction_df.isnull()

# True when at least one value anywhere in the dataframe is missing
student_grade_prediction_df_check_nan = bool(student_grade_prediction_null_mask_df.values.any())

if student_grade_prediction_df_check_nan:
    print("The Dataframe has Null Values!!!!!")

    # Number of missing values per column, then the overall total
    student_grade_prediction_null_counts = student_grade_prediction_null_mask_df.sum()
    print(f"There are {int(student_grade_prediction_null_counts.sum())} null values in the dataframe.")

    # List only the columns that actually contain missing values
    print(student_grade_prediction_null_counts[student_grade_prediction_null_counts > 0])
else:
    # Report the clean case explicitly so an empty output is never ambiguous
    print("The Dataframe has no Null Values.")

# ========================================

# Step 3 - Evaluate the Column Headers and Their Data Types

# ========================================

In [7]:
# Column headers of the student dataframe, returned as a pandas Index
student_grade_prediction_df.columns

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')

In [8]:
# Column Headers and Their Data Types

# Reference:
#     - Turn a Series into a Dataframe while naming the index and value columns
#         - https://pandas.pydata.org/docs/reference/api/pandas.Series.reset_index.html

# dtypes is a Series indexed by column header; label that index, then move it
# into a regular column while naming the values column "dtype"
student_grade_prediction_dtypes_df = (
    student_grade_prediction_df.dtypes
    .rename_axis("Column_Headers")
    .reset_index(name = "dtype")
)

print(type(student_grade_prediction_dtypes_df))
student_grade_prediction_dtypes_df

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Column_Headers,dtype
0,school,object
1,sex,object
2,age,int64
3,address,object
4,famsize,object
5,Pstatus,object
6,Medu,int64
7,Fedu,int64
8,Mjob,object
9,Fjob,object


## Step 3.1 Column Header Details

# ========================================

# Step 4 - Check the Statistics of the Data

# ========================================

# Step 4.1 - Initial Statistics Using Pandas Describe Function

In [11]:
# Reference:
#     - Pandas Describe
#         - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.describe.html

# Percentiles to report (describe also reports the 50% percentile, i.e. the median)
perc = [0.20, 0.40, 0.60, 0.80]

# Column dtypes to summarize: text columns as well as numeric columns
include = ["object", "float", "int"]

student_grade_prediction_df.describe(
    include = include,
    percentiles = perc,
)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,395,395,395.0,395,395,395,395.0,395.0,395,395,...,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
unique,2,2,,2,2,2,,,5,5,...,,,,,,,,,,
top,GP,F,,U,GT3,T,,,other,other,...,,,,,,,,,,
freq,349,208,,307,281,354,,,141,217,...,,,,,,,,,,
mean,,,16.696203,,,,2.749367,2.521519,,,...,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.908861,10.713924,10.41519
std,,,1.276043,,,,1.094735,1.088201,,,...,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,3.319195,3.761505,4.581443
min,,,15.0,,,,0.0,0.0,,,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0
20%,,,15.0,,,,2.0,1.0,,,...,3.0,2.0,2.0,1.0,1.0,2.0,0.0,8.0,8.0,8.0
40%,,,16.0,,,,2.0,2.0,,,...,4.0,3.0,3.0,1.0,2.0,3.0,2.0,10.0,10.0,10.0
50%,,,17.0,,,,3.0,2.0,,,...,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,11.0,11.0


# Step 4.1.1 - Observations from the Initial Statistics

In [None]:


# Print summary statistics for every int64 column and keep each column's z-scores

# Z-scores of each int64 column, keyed by column header, so that no column's
# result is lost when the loop moves on to the next column
zscores_by_clmn_hdr = {}

# select_dtypes picks the int64 columns straight from the dataframe, in column order
for clmn_hdrs_nm in student_grade_prediction_df.select_dtypes(include = "int64").columns:

    clmn_values = student_grade_prediction_df[clmn_hdrs_nm]

    # statistics.variance is the sample variance (n - 1 in the denominator)
    print(f"{clmn_hdrs_nm} Mean: {statistics.mean(clmn_values)}")
    print(f"{clmn_hdrs_nm} Median: {statistics.median(clmn_values)}")
    print(f"{clmn_hdrs_nm} Mode: {statistics.mode(clmn_values)}")
    print(f"{clmn_hdrs_nm} Variance: {statistics.variance(clmn_values)}")

    # scipy.stats.zscore standardizes with the population standard deviation (ddof = 0 by default)
    zscore_ndarray = scipy.stats.zscore(clmn_values)
    zscores_by_clmn_hdr[clmn_hdrs_nm] = zscore_ndarray

    print("----------------------------------------------------")

# One z-score column per int64 column, labeled with the original column header
zscores_by_clmn_hdr_df = pd.DataFrame(zscores_by_clmn_hdr)

# zscore_df keeps its previous meaning for later cells: the z-scores of the last
# int64 column processed, stored under the default column label 0
if zscores_by_clmn_hdr:
    zscore_df = pd.DataFrame(zscore_ndarray)

In [None]:
# Number of rows sharing each distinct z-score value.
# size() counts the rows of every group; count() only counts the non-grouping
# columns, and zscore_df has none besides column 0, so it displayed no counts.
zscore_df.groupby([0]).size()

In [None]:
# Median and mode for selected columns.
# The label and the data are both derived from the same column name, so they
# cannot drift apart; add more column headers to the list to report them too.
summary_clmn_hdrs = ["age"]

for summary_clmn_nm in summary_clmn_hdrs:
    # "age" -> "Age" for the printed label
    summary_clmn_label = summary_clmn_nm.capitalize()
    summary_clmn_values = student_grade_prediction_df[summary_clmn_nm]

    print(f"{summary_clmn_label} Median: {statistics.median(summary_clmn_values)}")
    print(f"{summary_clmn_label} Mode: {statistics.mode(summary_clmn_values)}")
    print("----------------------------------------------------")
print("----------------------------------------------------")

# ========================================

# Step 5 - Manipulate the Data for Plots

# ========================================

# ========================================

# Step 6 - Plot the Data

# ========================================

In [None]:
# Pairwise relationships between the columns of the student dataframe
sns.pairplot(data = student_grade_prediction_df)

# Render the figure explicitly so the cell shows the plot instead of the PairGrid repr
plt.show()

In [None]:
# Load seaborn's "tips" example dataset.
# NOTE(review): the variable is named heart_seaborn but holds the tips data - confirm the intended dataset.
heart_seaborn = sns.load_dataset("tips")

# display() is needed here: a bare expression is only rendered when it is the
# last line of a cell, so the preview was silently dropped before
display(heart_seaborn.head())

# Pairplot with kernel density estimates on the diagonal and regression fits elsewhere
sns.pairplot(data = heart_seaborn, diag_kind = "kde", kind = "reg")

In [None]:
sns.set_style("darkgrid")

# Pairplot of selected numeric columns with regression fits, colored by sex.
# NOTE(review): this cell previously plotted `heart_df` with the column "thalachh";
# neither is defined or loaded anywhere in this notebook, so it failed on a fresh
# kernel. It now uses the student dataframe and columns that exist in it -
# confirm that these are the columns of interest.
sns.pairplot(
    data = student_grade_prediction_df,
    hue = "sex",
    kind = "reg",
    vars = ["age", "absences", "G3"],
)
plt.show()