In [190]:
import numpy as np
import pandas as pd
import pdfplumber
# import re

In [191]:
pdf_path = 'NTNU vitnemål.pdf' #local path
# pdf_path = "NTNU grade transcript.pdf"

# Extract the text of every page into one newline-separated string.
# Pages are joined with "\n" so that the last line of one page is not glued
# onto the first line of the next page, which would corrupt any line-based
# parsing of `text_data`.
# `or ""` guards against pages without extractable text, for which
# extract_text() may return None (observed in older pdfplumber releases --
# NOTE(review): confirm against the installed version).
with pdfplumber.open(pdf_path) as pdf:
    text_data = "\n".join(page.extract_text() or "" for page in pdf.pages)


In [192]:
# Parse the transcript text into a DataFrame with one row per course.
columns = ['Course code', 'Letter grade','Grade', 'Course credits']

# Explicit letter -> number scale. Anything that is not a key here (for
# example the "passed" grade, or a stray single character) gets grade NaN.
GRADE_POINTS = {"A": 5, "B": 4, "C": 3, "D": 2, "E": 1, "F": 0}


def _all_ascii_digits(text):
    """Return True if `text` is non-empty and consists only of the digits 0-9."""
    return text != "" and all(ch in "0123456789" for ch in text)


def _is_credit_token(token):
    """Return True if `token` looks like a credit value.

    Accepted shapes are an integer part of one to three digits, optionally
    followed by a decimal comma or period and more digits ("7,5", "15",
    "22,5"). Longer integer parts (such as a four-digit year) and any
    non-numeric word are rejected.
    """
    whole, separator, fraction = token.replace(",", ".").partition(".")
    if not _all_ascii_digits(whole) or len(whole) > 3:
        return False
    return fraction == "" or _all_ascii_digits(fraction)


# Collect rows in a plain list and build the DataFrame once at the end,
# instead of growing the DataFrame one row at a time.
records = []

lines = text_data.split("\n")
for line in lines:

    # Course lines start with an upper-case course code such as TMA4100 or FY1001.
    # Requiring the first two characters to be upper-case letters filters out
    # most of the other text in the transcript. Not super robust, but it works.
    if not (line[0:2].isalpha() and line[0:2].isupper()):
        continue

    words = line.split()

    # A course line needs at least a code, a credits token and a grade;
    # shorter lines cannot be course lines and would break words[-3] below.
    if len(words) < 3:
        continue

    code = words[0]
    letter_grade = words[-1]
    course_credits = words[-3]

    # Courses with 0 credits such as HMS0001 have some other token in the
    # credits position. Validate the shape of the token rather than its
    # length, so that values like "22,5" are kept.
    if not _is_credit_token(course_credits):
        course_credits = np.nan

    # Convert the letter grade to a number; non-letter grades become NaN.
    grade = GRADE_POINTS.get(letter_grade, np.nan)

    records.append([code, letter_grade, grade, course_credits])

df = pd.DataFrame(records, columns=columns)




In [193]:
# The transcript writes credits with a decimal comma ("7,5"); convert them to
# floats ("7.5" -> 7.5) in a new float64 column named "course_values".
# Missing credits (NaN) stay NaN after the conversion. Text that is not a
# number still raises ValueError in astype(float), so parsing problems are not
# silently hidden.
course_credits_values = (
    df["Course credits"]
    .astype(str)
    .str.replace(",", ".", regex=False)
    .astype(float)
    .to_numpy()
)
df["course_values"] = course_credits_values

In [194]:

# Work on float copies so the arithmetic does not depend on the columns
# having object dtype.
credit_values = df["course_values"].astype(float)
grade_values = df["Grade"].astype(float)

# Only courses that have both a numeric grade and a credit value count towards
# the weighted average. Courses with credits but no letter grade (e.g. "passed")
# must be left out of the denominator explicitly: in the numerator their
# credits * NaN product is NaN and is skipped by sum(), but their credits alone
# would otherwise still be added to the total.
counts_towards_average = grade_values.notna() & credit_values.notna()
Total_credits_in_courses_with_grade = credit_values[counts_towards_average].sum()

# Weighted average = sum(credits * grade) / sum(credits) over the graded courses.
# If nothing was parsed, or no course has a letter grade, the average is
# undefined; report NaN instead of dividing by zero.
if Total_credits_in_courses_with_grade > 0:
    weighted_points = (credit_values * grade_values)[counts_towards_average].sum()
    avg_grade = weighted_points / Total_credits_in_courses_with_grade
else:
    avg_grade = np.nan
print(f"Your course credit weighted average grade is {avg_grade:.2f}. (This is the one you report)")

# Plain mean of the numeric grades; mean() skips the NaN (non-letter) grades.
unweighted_average = grade_values.mean()
print(f"Your unweighted average grade is {unweighted_average:.2f}.(This is just the mean of all your grades, regardless of how many credits the course gives. If all your courses are 7.5 credits, the number is identical)")

# Show what was parsed so the user can check it against the transcript.
print(f"Verify that you have {df.shape[0]} courses on the uploaded grade transcript")
print("Also verify that the grades were parsed correctly. The following dataframe was extracted:")
display(df[["Course code","Letter grade", "Grade","Course credits"]])

Your course credit weighted average grade is 4.04. (This is the one you report)
Your unweighted average grade is 4.09.(This is just the mean of all your grades, regardless of how many credits the course gives. If all your courses are 7.5 credits, the number is identical)
Verify that you have 38 courses on the uploaded grade transcript
Also verify that the grades were parsed correctly. The following dataframe was extracted:


Unnamed: 0,Course code,Letter grade,Grade,Course credits
0,TDT4110,B,4.0,75.0
1,TKT4240,C,3.0,75.0
2,TMT4100,B,4.0,75.0
3,TMA4100,B,4.0,75.0
4,TKT4116,Bestått,,75.0
5,TMA4105,Bestått,,75.0
6,TVM4101,Bestått,,75.0
7,EXPH0300,A,5.0,75.0
8,TMA4110,Bestått,,75.0
9,FY1001,B,4.0,75.0
