# Classmate

In [None]:
import json
from pathlib import Path

import ngender
import pandas as pd
import plotly.express as px

In [None]:
def load_json(file):
    """Parse the UTF-8 encoded JSON document stored at ``file``.

    Args:
        file: Anything accepted by the built-in ``open`` (typically a path).

    Returns:
        The Python object produced by ``json.load``.
    """
    with open(file, encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed


# Lookup tables read from JSON files in the current working directory.
# `college`, `class_id` and `department` are used later to translate the code
# fields parsed out of the student ID into names.
# NOTE(review): `department_graduate` is loaded but not referenced anywhere in
# the visible part of this notebook — confirm whether it is still needed.
college = load_json("college.json")
department = load_json("department.json")
department_graduate = load_json("department_graduate.json")
class_id = load_json("class_id.json")

In [None]:
# Directory holding the input CSV; the parsed result is written back into the
# same directory by the last cell of the notebook.
path = Path("classmate")
classmates = pd.read_csv(path / "classmates.csv")
# classmates

In [None]:
def filter(df, condition):
    """Print a short summary of the rows of ``df`` selected by ``condition``.

    The selected rows are re-indexed by (E-Mail, 姓名, 學號, 系所班別) and sorted
    on that index. Two lines are printed: the number of distinct index entries,
    and the distinct names (second index level) joined by ", ".

    NOTE: within this notebook the name shadows the built-in ``filter``.

    Args:
        df: DataFrame containing the columns "E-Mail", "姓名", "學號" and "系所班別".
        condition: Row selector passed to ``df.loc``.

    Returns:
        None. The summary is written to standard output.
    """
    key_columns = ["E-Mail", "姓名", "學號", "系所班別"]
    selected = df.loc[condition, :].set_index(key_columns).sort_index()
    print(f"Count: {selected.index.nunique()}")
    names = selected.index.unique(level=1)
    print(", ".join(names))

## E-mails with two or more student IDs

In [None]:
# `duplicated` flags every occurrence of a value except the first one, and `^`
# is element-wise XOR, so a row is selected when exactly one of "E-Mail seen
# before" / "學號 seen before" holds for it.
emails_with_two_or_more_ids = classmates.duplicated(
    subset="E-Mail"
) ^ classmates.duplicated(subset="學號")
filter(classmates, emails_with_two_or_more_ids)

## Names with two or more e-mails

In [None]:
# Same XOR construction as for student IDs: a row is selected when exactly one
# of "E-Mail seen before" / "姓名 seen before" holds (the first occurrence of
# each value is never flagged by `duplicated`).
names_with_two_or_more_emails = classmates.duplicated(
    subset="E-Mail"
) ^ classmates.duplicated(subset="姓名")
filter(classmates, names_with_two_or_more_emails)

## Community Students

In [None]:
# Rows whose 系所班別 contains the text "社會人士".
# NOTE(review): `str.contains` yields NaN for missing values, which `.loc`
# rejects as a mask — confirm 系所班別 has no gaps, or pass `na=False`.
community = classmates.loc[:, "系所班別"].str.contains("社會人士")
filter(classmates, community)

## UST Students

In [None]:
# Rows whose 系所班別 matches any of the alternatives in the regular expression
# (交大, 陽明, 政治 or 中央).
# NOTE(review): `str.contains` yields NaN for missing values, which `.loc`
# rejects as a mask — confirm 系所班別 has no gaps, or pass `na=False`.
ust = classmates.loc[:, "系所班別"].str.contains("交大|陽明|政治|中央")
filter(classmates, ust)

In [None]:
def convert_to_str(value):
    """Return ``value`` converted with the built-in ``str``."""
    text = str(value)
    return text

In [None]:
# Rows that are neither community nor UST students.
nthu = ~(community | ust)
# For those rows 畢業系所 is the first three characters of 系所班別 ...
classmates.loc[nthu, "畢業系所"] = classmates.loc[nthu, "系所班別"].str.extract(
    "(.{3})", expand=False
)
# ... for all other rows the full 系所班別 value is kept.
classmates.loc[~nthu, "畢業系所"] = classmates.loc[~nthu, "系所班別"]

# Split the e-mail address around "@" into a user part and a domain part.
columns = ["username", "domain"]
classmates.loc[:, columns] = (
    classmates.loc[:, "E-Mail"]
    .str.extract(r"(.+)@(.+)")
    .set_axis(columns, axis="columns")
)

# Create the five student-ID fields as empty object columns first, so that rows
# not filled in below stay missing.
columns = ["入學年度", "學院代碼", "系所代碼", "班級代碼", "流水號"]
classmates[columns] = pd.DataFrame(columns=columns, dtype="object")
# All-digit IDs are split into: 2-3 digits, 1-2 digits, 1 digit, 1 digit and
# 2 digits. IDs that do not match the pattern yield missing values.
classmates.loc[nthu, columns] = (
    classmates.loc[nthu, "學號"]
    .str.extract(r"^(\d{2,3})(\d{1,2})(\d)(\d)(\d{2})$")
    .set_axis(columns, axis="columns")
)
# Left-pad the college code with zeros to two characters.
# NOTE(review): presumably this matches the key format of college.json — verify.
classmates.loc[nthu, "學院代碼"] = classmates.loc[nthu, "學院代碼"].str.zfill(2)

# Community-student IDs: optional leading "X", 2-3 digits, 4 digits, optional
# trailing "S". Only the two digit groups are kept.
columns = ["入學年度", "流水號"]
classmates.loc[community, columns] = (
    classmates.loc[community, "學號"]
    .str.extract(r"^X?(\d{2,3})(\d{4})S?$")
    .set_axis(columns, axis="columns")
)

# classmates

In [None]:
def foreign_student(s):
    """Classify a row by its class name (班級) and serial number (流水號).

    Each group of class names has a list of lower bounds on the serial number,
    checked from the highest bound down. The label of the first bound that the
    serial number reaches is returned.

    Args:
        s: Mapping-like row providing "流水號" (convertible with ``int``) and "班級".

    Returns:
        One of "陸生", "外籍生", "僑生", "TIGP", "預留", or ``None`` when the class
        name is not covered or the serial number is below every bound.
    """
    serial_number = int(s["流水號"])
    class_name = s["班級"]

    # (class names, ((lower bound, label), ...)) with bounds in descending order.
    rules = (
        (
            ("單班", "清班", "華班", "梅班"),
            ((95, "陸生"), (81, "外籍生"), (61, "僑生")),
        ),
        (
            ("碩班境外生",),
            ((86, "預留"), (66, "陸生"), (56, "TIGP"), (21, "外籍生"), (1, "僑生")),
        ),
        (
            ("博士一般學生", "學士逕讀博士"),
            ((91, "外籍生"), (86, "預留"), (81, "僑生"), (71, "陸生"), (61, "TIGP")),
        ),
    )

    for class_names, thresholds in rules:
        if class_name in class_names:
            for lower_bound, label in thresholds:
                if serial_number >= lower_bound:
                    return label
            # Class matched but the serial number is below every bound.
            return None
    return None

In [None]:
# Translate the parsed code fields into names using the JSON lookup tables.
classmates["學院"] = classmates["學院代碼"].map(college)
classmates["班級"] = classmates["班級代碼"].map(class_id)
# Boolean flags derived from the class name; 研究生 is true when either flag is.
classmates["碩士生"] = classmates["班級"].isin(["碩士一般學生", "碩班境外生"])
classmates["博士生"] = classmates["班級"].isin(["博士一般學生", "學士逕讀博士"])
classmates["研究生"] = classmates["碩士生"] | classmates["博士生"]
# Look up the department by the (學院代碼, 系所代碼) pair in the unstacked table.
# NOTE(review): presumably department.json is keyed by college code first and
# department code second — verify against the file.
classmates["系所"] = classmates.set_index(["學院代碼", "系所代碼"]).index.map(
    pd.DataFrame(department).unstack()
)
# Only rows with a serial number are classified; the assignment aligns on the
# row index, so the remaining rows are left missing.
classmates["境外生"] = classmates.dropna(subset=["流水號"]).apply(foreign_student, axis=1)
# classmates

In [None]:
def filter_class(df, class_name):
    """Print the ``filter`` summary for the rows of ``df`` whose 班級 equals ``class_name``."""
    filter(df, df["班級"] == class_name)

## Early Entry Students

In [None]:
filter_class(classmates, "碩士提前入學")

## PhD Students

In [None]:
filter_class(classmates, "博士一般學生")

## Direct Pursuit PhD Students

In [None]:
filter_class(classmates, "學士逕讀博士")

## Foreign Master Students
(not necessarily master's students)

In [None]:
filter_class(classmates, "碩班境外生")

In [None]:
def filter_foreign(df, foreign_type):
    """Print the ``filter`` summary for the rows of ``df`` whose 境外生 equals ``foreign_type``."""
    filter(df, df["境外生"] == foreign_type)

## Overseas Compatriot Students

In [None]:
filter_foreign(classmates, "僑生")

## Foreign Students

In [None]:
filter_foreign(classmates, "外籍生")

## TIGP Students

In [None]:
filter_foreign(classmates, "TIGP")

## Chinese Students

In [None]:
filter_foreign(classmates, "陸生")

## Other Students

In [None]:
filter_foreign(classmates, "預留")

In [None]:
def guess_gender(name):
    """Guess a gender label for ``name`` using ``ngender``.

    Spaces are removed from the name before it is passed to ``ngender.guess``.

    Args:
        name: The name to classify. Non-string values (for example the float
            NaN pandas uses for an empty CSV cell) are accepted.

    Returns:
        The first element of the pair returned by ``ngender.guess`` (the
        original inline comment lists male, female or unknown). ``"error"`` is
        returned when ``name`` is not a string or the guess raises.
    """
    if not isinstance(name, str):
        # A missing name has no .replace(); report it as "error" like any other
        # failure instead of aborting the whole column mapping.
        return "error"
    try:
        gender, _prob = ngender.guess(name.replace(" ", ""))
    except Exception:
        # Best-effort classification: any failure inside ngender is reported
        # with the "error" label.
        return "error"
    return gender

In [None]:
classmates["猜測性別"] = classmates["姓名"].map(guess_gender)

## Unpredictable Gender

In [None]:
# Rows whose guessed gender is anything other than "male" or "female".
unpredictable_gender = ~classmates["猜測性別"].isin(["male", "female"])
filter(classmates, unpredictable_gender)

## Switch Major Students

In [None]:
def switch_major(s):
    """Decide whether a row's 畢業系所 differs from its 系所.

    Args:
        s: Mapping-like row providing the strings "系所" and "畢業系所".

    Returns:
        ``False`` when 系所 contains "學士班" or "跨系所招生", when the pair is
        ("運科系", "體育學系"), or when every character of 畢業系所 also occurs
        in 系所; ``True`` otherwise.
    """
    current = s["系所"]
    if "學士班" in current or "跨系所招生" in current:
        return False

    graduated = s["畢業系所"]
    if graduated == "運科系" and current == "體育學系":
        return False

    # The abbreviation counts as the same department when all of its characters
    # appear somewhere in the full department name.
    return not set(graduated).issubset(current)


# Rows missing either department column are skipped and stay missing in 轉系.
classmates["轉系"] = classmates.dropna(subset=["畢業系所", "系所"]).apply(switch_major, axis=1)
# Comparing with `== True` turns those missing entries into False, giving a
# purely boolean mask.
filter(classmates, classmates["轉系"] == True)

## Defer Graduation Students

In [None]:
def defer_graduation(s):
    """Tell whether the gap between 學年 and 入學年度 reaches the deferral limit.

    Args:
        s: Mapping-like row providing "學年" and "入學年度" (both convertible
            with ``int``) and the flag "碩士生".

    Returns:
        ``True`` when the gap is at least 5 years, or at least 3 years for a
        row flagged as 碩士生; ``False`` otherwise.
    """
    elapsed = int(s["學年"]) - int(s["入學年度"])
    return bool(elapsed >= 5 or (s["碩士生"] and elapsed >= 3))


# Rows missing the admission year or the course year are skipped and stay
# missing in 延畢.
classmates["延畢"] = classmates.dropna(subset=["入學年度", "學年"]).apply(
    defer_graduation, axis=1
)
# Comparing with `== True` turns those missing entries into False, giving a
# purely boolean mask.
filter(classmates, classmates["延畢"] == True)

In [None]:
# some classmates may have no E-Mail address, so we use student ID instead
same_class = classmates.groupby(["學號"]).size()
df = classmates.set_index("學號")
df.loc[same_class.index, "同班次數"] = same_class
classmates = df.reset_index()

In [None]:
unique_student_id = classmates.drop_duplicates(subset="學號", ignore_index=True)

In [None]:
px.pie(unique_student_id, names="學院", title="College Distribution")

In [None]:
px.pie(
    unique_student_id.dropna(subset=["境外生"]),
    names="境外生",
    title="Foreign Student Distribution",
)

In [None]:
px.pie(unique_student_id, names="猜測性別", title="Gender Distribution")

In [None]:
px.histogram(
    unique_student_id,
    x="學院",
    color="猜測性別",
    title="Student Count by College and Gender",
).update_layout(yaxis_title="人數")

In [None]:
course = "課程"
# course = "Course"

In [None]:
px.histogram(
    unique_student_id,
    x=course,
    color="猜測性別",
    title="Student Count by Course and Gender",
).update_layout(yaxis_title="人數")

In [None]:
# Rows with a parsed admission year, with that column converted to integers.
# `astype` returns a new frame, so nothing is assigned into the filtered copy
# of `unique_student_id`. The previous `.loc[:, col] = ...` assignment could
# raise SettingWithCopyWarning and, on pandas >= 2, sets values in place so the
# column kept its object dtype.
entry_year = unique_student_id.dropna(subset=["入學年度"]).astype({"入學年度": "int"})
px.histogram(
    entry_year,
    x="入學年度",
    color="學年",
    title="Student Count by Admission Year and Course Year",
).update_layout(yaxis_title="人數")

In [None]:
px.histogram(
    unique_student_id,
    x="domain",
    color="domain",
    title="Student Count by E-Mail Domain",
).update_layout(yaxis_title="人數")

In [None]:
# Keep only students that appear more than once, most frequent first; once per
# student ID and once for every original row.
unique_student_id_same_class = unique_student_id.loc[
    unique_student_id["同班次數"] > 1
].sort_values("同班次數", ascending=False)
classmates_same_class = classmates.loc[
    classmates["同班次數"] > 1
].sort_values("同班次數", ascending=False)

In [None]:
px.histogram(
    unique_student_id_same_class,
    x="同班次數",
    color="系所",
    title="Student Count by Same Class Count and Department",
).update_layout(yaxis_title="人數")

In [None]:
px.histogram(
    unique_student_id_same_class,
    x="同班次數",
    color=course,
    title="Student Count by Same Class Count and Course",
).update_layout(yaxis_title="人數")

In [None]:
px.strip(
    classmates_same_class,
    x=course,
    y="同班次數",
    color="系所",
    hover_name="姓名",
    hover_data=["學號"],
    title="Same Class Count by Course",
)

In [None]:
px.strip(
    classmates_same_class,
    x="系所",
    y="同班次數",
    color=course,
    hover_name="姓名",
    hover_data=["學號"],
    title="Same Class Count by Department",
)

In [None]:
classmates.to_csv(path / "classmates_parsed.csv", index=False)