### Package versions used:

Python 3.8

pdfplumber 0.10.2

Numpy 1.21.5

Pandas 1.4.3

In [3]:
import re

import numpy as np
import pandas as pd
import pdfplumber


## Enter your filename here
Make sure the PDF file is placed in the same folder as this notebook.

In [4]:
pdf_path = 'NTNU grade transcript.pdf' #local path. Enter your filename here

# Extract the text of every page and join the pages with a newline.
# Without the separator the last line of one page is glued to the first line
# of the next page, which can corrupt a course row that sits at a page boundary.
# `or ""` guards against a page for which no text could be extracted
# (assumes extract_text() may return None/"" for such pages - harmless otherwise).
with pdfplumber.open(pdf_path) as pdf:
    page_texts = [page.extract_text() or "" for page in pdf.pages]

text_data = "\n".join(page_texts)


In [10]:
columns = ["Course code", "Letter grade", "Number grade", "Course credits"]

# A credit token looks like "7,5", "15" or "22,5" (decimal comma or point).
# Four-digit years, dates and plain words do not match, so rows without a real
# credit value (e.g. HMS0001, or a duplicate course listed with "—") get NaN.
CREDITS_PATTERN = re.compile(r"[0-9]{1,3}(?:[.,][0-9]{1,2})?")

# Letter grades mapped to numbers: A=5, B=4, C=3, D=2, E=1, F=0.
GRADE_TO_NUMBER = {letter: ord("F") - ord(letter) for letter in "ABCDEF"}


def parse_course_line(line):
    """Turn one transcript line into a course record, or None if it is not a course row.

    Parameters
    ----------
    line : str
        A single line of text extracted from the transcript PDF.

    Returns
    -------
    list or None
        ``[code, letter_grade, number_grade, course_credits]`` in the same order
        as ``columns``. ``number_grade`` is NaN for anything that is not a letter
        A-F (e.g. "Pass"); ``course_credits`` is the original token (such as
        "7,5") or NaN when the token is not a credit value. Returns None for
        lines that do not look like a course row.
    """
    # NTNU course codes start with two or three upper-case letters (TMA4100, FY1001).
    # This filters out most of the other text in the transcript; it is not
    # super robust, but it works.
    prefix = line[0:2]
    if not (prefix.isalpha() and prefix.isupper()):
        return None

    words = line.split()
    # The code is the first token, the grade the last, and the credits the
    # third from last; a shorter line cannot be a complete course row.
    if len(words) < 3:
        return None

    code = words[0]
    letter_grade = words[-1]
    course_credits = words[-3]

    if not CREDITS_PATTERN.fullmatch(course_credits):
        course_credits = np.nan

    # Anything that is not a single letter A-F (e.g. "Pass") has no number grade.
    grade = GRADE_TO_NUMBER.get(letter_grade, np.nan)

    return [code, letter_grade, grade, course_credits]


# Build all records first and create the DataFrame once, instead of growing it
# row by row.
lines = text_data.split("\n")
records = [
    record
    for record in (parse_course_line(line) for line in lines)
    if record is not None
]
df = pd.DataFrame(records, columns=columns)
        
# The transcript writes credits with a decimal comma ("7,5"); convert them to
# floats ("7.5" -> 7.5). Missing credits (NaN) become the string "nan" in the
# intermediate step, which converts straight back to NaN.
# Doing this with pandas string methods keeps the column as float64 rather
# than a column of generic Python objects.
df["course_values"] = (
    df["Course credits"]
    .astype(str)
    .str.replace(",", ".", regex=False)
    .astype(float)
)
course_credits_values = df["course_values"].to_numpy()

# Work on numeric copies so the arithmetic below does not depend on whether
# the DataFrame columns are stored as floats or as generic Python objects.
credit_values = pd.to_numeric(df["course_values"])
number_grades = pd.to_numeric(df["Number grade"])

# Only courses with a number grade count towards the weighted average, so the
# credits of pass/fail courses must be left out of the denominator.
has_grade = number_grades.notna()
Total_credits_in_courses_with_grade = credit_values[has_grade].sum()

# Weighted average = sum(grade * credits) / sum(credits of graded courses).
# pandas skips NaN when summing, so courses without a grade (or without
# credits) drop out of the numerator automatically. The denominator needs the
# explicit mask above because a pass/fail course still has credits, and those
# are not removed by a NaN grade the way they are in the numerator.
weighted_grade_sum = (credit_values * number_grades).sum()

if Total_credits_in_courses_with_grade == 0:
    # No course has both a grade and credits (e.g. nothing was parsed):
    # report NaN instead of dividing by zero.
    avg_grade = np.nan
else:
    avg_grade = weighted_grade_sum / Total_credits_in_courses_with_grade

# Headline result: the credit-weighted average computed earlier in this cell.
print(f" ### Average grade: {avg_grade:.2f} ###")

# Plain mean of the number grades, ignoring credits.
unweighted_average = df["Number grade"].mean()

# Checklist the reader can use to confirm that the PDF was parsed correctly.
verification_notes = [
    "\nVerifying the number\n",
    "To be sure that the average is correct, you can do some checks to verify that the PDF was parsed correctly:",
    f"* Check that you have **{df.shape[0]}** courses on your original transcript",
    '* Verify that any pass/fail courses have been converted to "None" in the Number Grade column',
    "* Verify that the course credits are correct",
    "* Check that unusual courses such as HMS0001 has no credits and no grade",
]
for note in verification_notes:
    print(note)

# Show the parsed table (without the helper column holding the float credits).
shown_columns = ["Course code", "Letter grade", "Number grade", "Course credits"]
display(df.loc[:, shown_columns])


 ### Average grade: 4.07 ###

Verifying the number

To be sure that the average is correct, you can do some checks to verify that the PDF was parsed correctly:
* Check that you have **37** courses on your original transcript
* Verify that any pass/fail courses have been converted to "None" in the Number Grade column
* Verify that the course credits are correct
* Check that unusual courses such as HMS0001 has no credits and no grade


Unnamed: 0,Course code,Letter grade,Number grade,Course credits
0,TDT4110,B,4.0,7.5
1,TMT4100,B,4.0,7.5
2,TMA4100,B,4.0,7.5
3,TMA4105,Pass,,7.5
4,TKT4116,Pass,,7.5
5,TVM4101,Pass,,7.5
6,EXPH0300,A,5.0,7.5
7,TMA4110,Pass,,7.5
8,TMA4240,C,3.0,7.5
9,FY1001,B,4.0,7.5
