In [None]:
import matplotlib as plt
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Load the raw student-performance dataset from the local data directory.
stud_df = pd.read_csv("./data/StudentsPerformance.csv")

In [None]:
# Display the freshly loaded frame for a first visual check.
stud_df

In [None]:
# Summarise the raw columns (dtypes, non-null counts) before renaming.
stud_df.info()

In [None]:
# Swap the raw CSV headers (spaces, slashes) for snake_case identifiers.
# "gender" and "lunch" map to themselves; they are listed so the mapping
# documents every column of the dataset.
stud_df = stud_df.rename(
    mapper={
        "gender": "gender",
        "race/ethnicity": "social_group",
        "parental level of education": "parent_education",
        "lunch": "lunch",
        "test preparation course": "test_prep_course",
        "math score": "math_score",
        "reading score": "read_score",
        "writing score": "write_score",
    },
    axis="columns",
)

In [None]:
# Confirm the renamed column labels.
stud_df.columns

In [None]:
# Re-check the frame summary after the rename.
stud_df.info()

# Adding Grade Column

In [None]:
def AverageToGrade(AverageScore):
    """Translate a numeric average score into a letter grade.

    The cut-offs are inclusive lower bounds: 80 -> "A", 70 -> "B", 60 -> "C",
    50 -> "D", 40 -> "E". Any value that reaches none of them yields "F".
    """
    # Ordered from the highest floor down so the first match is the best grade.
    grade_floors = ((80, "A"), (70, "B"), (60, "C"), (50, "D"), (40, "E"))
    for floor, letter in grade_floors:
        if AverageScore >= floor:
            return letter
    return "F"

## Setting average

In [None]:
# Per-student mean of the three subject scores, stored as a new "average" column.
score_columns = ["math_score", "read_score", "write_score"]
stud_df["average"] = stud_df[score_columns].mean(axis="columns")
stud_df.head()

In [None]:
# Derive the letter grade from the "average" column alone. Mapping over that one
# Series avoids building a row object for every record, which is what
# DataFrame.apply(axis=1) does, and produces the same values.
stud_df["grade"] = stud_df["average"].map(AverageToGrade)
stud_df.head(20)

In [None]:
# Number of students per letter grade, split by gender. Columns are referenced by
# name through `data=`, and `order` pins the x-axis to alphabetical grade order
# instead of the order in which grades happen to appear in the frame.
sns.countplot(
    data=stud_df,
    x="grade",
    hue="gender",
    order=sorted(stud_df["grade"].unique()),
)

In [None]:
# Mean of every score column (and of the overall average) per gender,
# rounded to two decimals. The result is indexed by gender.
mean_columns = ["math_score", "read_score", "write_score", "average"]
scores_by_gender = stud_df.groupby("gender")[mean_columns]
gender_mean = scores_by_gender.mean().round(2)
gender_mean

In [None]:
# Render the per-gender means as a Plotly table: one row per metric, one column
# per gender. gender_mean must still be indexed by gender here, because the
# transposed frame is addressed through its "male" / "female" columns.
metric_by_gender = gender_mean.T

# Styling shared by the header row and the body cells.
shared_style = dict(
    line_color="darkslategray",
    align="center",
    height=40,
    font_size=20,
)

summary_table = go.Table(
    header=dict(
        values=["", "Male", "Female"],
        fill_color="lightskyblue",
        **shared_style,
    ),
    cells=dict(
        values=[
            metric_by_gender.index,
            metric_by_gender["male"],
            metric_by_gender["female"],
        ],
        fill_color="lightcyan",
        **shared_style,
    ),
)

fig = go.Figure(data=[summary_table])
fig.update_layout(width=800, height=400)
fig.show()

In [None]:
# Kernel-density estimate of the math scores per gender.
# `fill` is a boolean flag; the original string "true" only worked because any
# non-empty string is truthy.
sns.displot(stud_df, x="math_score", hue="gender", kind="kde", fill=True)
# The statistic printed is the median, so label it as such (%d truncates to an integer).
print("Median: %d" % stud_df["math_score"].median())

In [None]:
# Work on a copy with "gender" as a regular column. gender_mean itself stays
# indexed by gender, so cells that address it by gender label can be re-run.
gender_mean_long = gender_mean.reset_index()

# (column to plot, panel title) for each of the four side-by-side panels.
score_panels = [
    ("math_score", "Math Score"),
    ("read_score", "Reading Score"),
    ("write_score", "Writing Score"),
    ("average", "Average Score"),
]

figure, axes = plt.pyplot.subplots(1, 4, sharex=True, sharey=True, figsize=(18, 6))
figure.suptitle("Mean Score Visualize by Gender")
for ax, (score_column, panel_title) in zip(axes, score_panels):
    sns.barplot(x="gender", y=score_column, data=gender_mean_long, palette="pastel", ax=ax)
    ax.set_title(panel_title)

# Explore Categorical Variables


In [None]:
# Empty two-column frame; the loop in the following cell adds one row per
# categorical column.
unique_df = pd.DataFrame(columns=["VariableName", "UniqueValues"])

# (position, name) pairs for every column whose dtype is reported as "object".
object_columns = [
    (position, name)
    for position, name in enumerate(stud_df.columns)
    if str(stud_df[name].dtype) == "object"
]
object_columns

In [None]:
# One row per categorical column, keyed by the column's position in stud_df:
# its name plus a comma-separated list of the distinct values it holds.
for position, name in object_columns:
    distinct_values = stud_df[name].unique()
    unique_df.loc[position, ["VariableName", "UniqueValues"]] = [
        name,
        ", ".join(distinct_values),
    ]

In [None]:
# Raise the per-cell display width so the joined unique-value strings are not truncated.
pd.options.display.max_colwidth = 100

unique_df

In [None]:
# Summary statistics with pandas' default column selection.
stud_df.describe()

In [None]:
# Summary statistics for every column, categorical ones included.
stud_df.describe(include="all")

# Labeling Categorical Variables
## social_group

0 ---> group A

1 ---> group B

2 ---> group C

3 ---> group D

4 ---> group E

## parent_education

0 ---> associate's degree

1 ---> bachelor's degree

2 ---> high school

3 ---> master's degree

4 ---> some college

5 ---> some high school

## Gender

0 ---> female

1 ---> male

## Grade

0 ---> A

1 ---> B

2 ---> C

3 ---> D

4 ---> E

5 ---> F

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every categorical column with integer codes. Each fitted encoder is
# kept, keyed by column name, so a code can be translated back later with
# label_encoders[name].inverse_transform(...) or inspected via .classes_.
label_encoders = {}
for _, column_name in object_columns:
    encoder = LabelEncoder()
    stud_df[column_name] = encoder.fit_transform(stud_df[column_name].values)
    label_encoders[column_name] = encoder

# Only the last expression of a cell is rendered, so the styled preview that used
# to be built before this line was silently discarded; it has been dropped.
stud_df.describe()

In [None]:
# Preview the first 11 encoded rows, coloured with the "Set2" colormap via the Styler.
stud_df.head(11).style.background_gradient(cmap="Set2")

In [None]:
# Persist the encoded frame. No `index=False` is passed, so pandas also writes the
# row index as an unnamed leading column.
# NOTE(review): any positional column selection on the re-read file is shifted by
# that extra column — confirm this is intended before changing either side.
stud_df.to_csv("./data/new_dataset.csv")

In [None]:
# Reload the encoded dataset from disk and check its dimensions.
# NOTE(review): the file was written together with its row index, so expect one
# extra leading column compared with stud_df — confirm against the printed shape.
new_df = pd.read_csv("./data/new_dataset.csv")
print(new_df.shape)
new_df.head()

In [None]:
# Integer code -> letter grade lookup for the label-encoded "grade" column.
# NOTE(review): assumes all six letters occur in the data so the encoder's codes
# line up with this table — verify. Not referenced by the cells visible here.
grade_labels = {0: "A", 1: "B", 2: "C", 3: "D", 4: "E", 5: "F"}

In [None]:
# Feature matrix, selected by column name rather than by position. These are the
# four columns the former slice `iloc[:, 5:-2]` picked up on the re-read CSV
# (whose first column is the saved row index); naming them keeps the selection
# stable if the column layout of the file ever changes.
# NOTE(review): "test_prep_course" is included only because it sat at position 5;
# confirm it is meant to be a feature alongside the three scores.
feature_columns = ["test_prep_course", "math_score", "read_score", "write_score"]
X = new_df[feature_columns]
X.values[:10]

In [None]:
# Target: the label-encoded letter grade. It is the last column of new_df, but
# selecting it by name does not depend on column order.
y = new_df["grade"]
y

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Baseline: plain linear regression on the selected features.
regressor = LinearRegression()
regressor.fit(X_train, y_train)
print(regressor.intercept_)
print(regressor.coef_)
y_pred = regressor.predict(X_test)
df = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
# A bare `df` in the middle of a cell is never rendered (only the last expression
# is), so show it explicitly with the notebook's built-in display().
display(df)

# Second model: degree-4 polynomial expansion of the training features, followed
# by a linear regression fitted on the expanded matrix.
poly_reg = PolynomialFeatures(degree=4)
x_poly = poly_reg.fit_transform(X_train)
pol_reg = LinearRegression()
pol_reg.fit(x_poly, y_train)

In [None]:
# Round the continuous predictions to the nearest integer grade code.
# np.rint accepts floats and already-rounded ints alike, so re-running this cell
# is harmless (the former per-element `.round()` call failed on plain ints).
y_pred = np.rint(y_pred).astype(int).tolist()

In [None]:
# Side-by-side view of the true grade codes and the integer-rounded predictions.
df = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
df

In [None]:
# Evaluate the polynomial model on the held-out rows. The feature expansion was
# fitted on the training data, so it is only applied here (transform); a
# preprocessing step should never be refitted on test data.
y_pred = pol_reg.predict(poly_reg.transform(X_test))
# Round to the nearest integer grade code; safe to re-run on already-rounded values.
y_pred = np.rint(y_pred).astype(int).tolist()
df = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
df

In [None]:
# Actual grade codes (red, all rows) against predicted ones (blue, test rows),
# one panel per feature. Markers are used instead of connected lines: the rows are
# not sorted, so joining consecutive points only produced an unreadable zig-zag.
n_features = X.shape[1]
pred_fig, pred_axes = plt.pyplot.subplots(
    1, n_features, sharey=True, figsize=(5 * n_features, 4), squeeze=False
)
for ax, feature in zip(pred_axes.ravel(), X.columns):
    ax.scatter(X[feature], y, color="red", alpha=0.3, label="Actual")
    ax.scatter(X_test[feature], y_pred, color="blue", alpha=0.3, label="Predicted")
    ax.set_xlabel(feature)
pred_axes[0, 0].set_ylabel("grade (encoded)")
pred_axes[0, 0].legend()
pred_fig.suptitle("Actual vs. predicted grade by feature")
plt.pyplot.show()

In [None]:
from sklearn import metrics

# Error metrics for y_pred (as left by the preceding cells) against the held-out
# targets. The squared error is computed once and reused for its root.
abs_error = metrics.mean_absolute_error(y_test, y_pred)
sq_error = metrics.mean_squared_error(y_test, y_pred)

print("Mean Absolute Error:", abs_error)
print("Mean Squared Error:", sq_error)
print("Root Mean Squared Error:", np.sqrt(sq_error))