In [None]:
# import dependencies
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pathlib import Path
import multiprocessing
from multiprocessing.pool import ThreadPool as Pool
from tqdm import tqdm

In [None]:
# Card fields kept for every scraped page: getRelevantInformation looks up
# exactly these keys, and scrapLinks uses them as DataFrame columns.
labels = [
    'ID Code',
    'Course Title',
    'Credits (CFU / ECTS)',
    'Name',
    'Track',
    'Semester',
]

# Shorter column names applied to the scraped DataFrame via DataFrame.rename.
renamed_labels = {
    'Name' : 'Master',
    'Credits (CFU / ECTS)' : 'Credits'
}

In [None]:
def getURLContent(URL:str, timeout:float=30):
    """Download a page and return its raw body.

    Parameters
    ----------
    URL : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        single stalled connection would block its pool worker indefinitely.

    Returns
    -------
    bytes
        The undecoded response body.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or a 4xx/5xx status code.
    """
    response = requests.get(URL, timeout=timeout)
    # Fail here, with the status code and URL in the message, instead of
    # handing an error page to the HTML parser.
    response.raise_for_status()
    return response.content

def getContentEIC(EIC):
    """Return the text of an element with every whitespace run collapsed to one space.

    `EIC` is any object exposing `get_text()`; leading and trailing
    whitespace is removed as a side effect of the split/join.
    """
    words = EIC.get_text().split()
    return ' '.join(words)

def getRelevantInformation(URL:str):
    """Scrape one course page and return the fields listed in `labels`.

    The page is expected to contain a `<td class="CenterBar">` holding
    `<table class="BoxInfoCard">` tables, in which `ElementInfoCard1` cells
    carry labels and `ElementInfoCard2` cells carry the matching values.

    Parameters
    ----------
    URL : str
        Address of the course page.

    Returns
    -------
    dict
        One entry per name in `labels` (the credits value as float, all
        others as str) plus a 'URL' entry with the page address.

    Raises
    ------
    IndexError
        If the CenterBar cell is missing or a card has fewer values than labels.
    ValueError
        If the credits value is not a number.
    KeyError
        If the page lacks one of the fields in `labels`.

    Every error message names the URL, because this function runs inside a
    thread pool where the traceback alone does not identify the failing page.
    """
    soup = BeautifulSoup(getURLContent(URL), 'html.parser')

    mainSection = soup.find('td', class_="CenterBar")
    if mainSection is None:
        raise IndexError(f'no <td class="CenterBar"> found in {URL}')

    data = {}

    for infoBox in mainSection.find_all('table', class_="BoxInfoCard"):
        labelEIC = infoBox.find_all('td', class_="ElementInfoCard1")
        valueEIC = infoBox.find_all('td', class_="ElementInfoCard2")

        if len(valueEIC) < len(labelEIC):
            raise IndexError(
                f'{URL}: info card has {len(labelEIC)} labels but only {len(valueEIC)} values'
            )

        # Labels and values are paired by position within the card.
        for labelCell, valueCell in zip(labelEIC, valueEIC):
            label = getContentEIC(labelCell)
            value = getContentEIC(valueCell)
            if label == 'Credits (CFU / ECTS)':
                try:
                    data[label] = float(value)
                except ValueError as error:
                    raise ValueError(f'{URL}: cannot parse credits value {value!r}') from error
            else:
                data[label] = str(value)

    missing = [label for label in labels if label not in data]
    if missing:
        raise KeyError(f'{URL} is missing expected fields: {missing}')

    relevantData = {label:data[label] for label in labels}
    relevantData['URL'] = URL

    return relevantData

def scrapLinks(URLs):
    """Scrape every URL concurrently and collect the results in a DataFrame.

    Parameters
    ----------
    URLs : iterable of str
        Course page addresses; each is passed to `getRelevantInformation`.

    Returns
    -------
    pandas.DataFrame
        One row per URL with the columns in `labels` followed by 'URL'.
        Rows are in reverse completion order (the most recently finished
        page first), which is not deterministic between runs. With no URLs
        an empty frame with the `labels` columns is returned.

    Any exception raised while scraping a page propagates to the caller; the
    worker threads are shut down in that case as well.
    """
    URLs = list(URLs)  # tqdm needs len(); this also accepts generators

    # The context manager terminates the worker threads on exit, including
    # when a page raises, so no pool is left behind in the kernel.
    with Pool(multiprocessing.cpu_count()*2) as pool:
        results = pool.imap_unordered(getRelevantInformation, URLs)
        records = list(tqdm(results, total = len(URLs), colour='green'))

    if not records:
        return pd.DataFrame(columns=labels)

    # Each new row used to be prepended; reversing once keeps that row order
    # while building the frame in a single step instead of one concat per row.
    records.reverse()
    return pd.DataFrame(records, columns=list(labels) + ['URL'])

In [None]:
# Read the list of pages to scrape: one URL per line.
with open(Path('sandbox')/'links.txt', 'r') as linkFile:
    data = linkFile.read()

# Split on line breaks and drop blank lines: a trailing newline in the file
# would otherwise produce an empty "URL" that the HTTP request rejects.
URLs = [line.strip() for line in data.splitlines() if line.strip()]

In [None]:
# Scrape every listed page (network-bound step), then summarise the result.
df = scrapLinks(URLs)
df.describe()

In [None]:
# Apply the shorter column names. Rebinding `df` rather than renaming in
# place leaves the frame object produced by the scraping cell untouched.
df = df.rename(columns=renamed_labels)

df.head(4)

In [None]:
# Show every row whose 'ID Code' occurs more than once (keep=False flags all
# occurrences), ordered by code so the duplicates sit next to each other.
df[df.duplicated(subset=['ID Code'], keep=False)].sort_values(by=['ID Code'])

In [None]:
# Keep only the first row found for each 'ID Code'.
df_nodup = df.drop_duplicates(subset=['ID Code'], keep='first')
df_nodup.head(4)

In [None]:
# Keep only the text after the last ' - ' in the master name.
# .assign() returns a new frame instead of writing a column into the result
# of drop_duplicates(), which avoids pandas' SettingWithCopyWarning.
df_nodup = df_nodup.assign(Master=df_nodup['Master'].str.rsplit(' - ', n=1).str[-1])
df_nodup.head(4)

In [None]:
# Sort keys, most significant first.
sort_order = ['Semester', 'Master', 'Track', 'Credits']
# 'Semester' and 'Credits' are sorted descending, every other key ascending.
sort_way = [column not in ('Semester', 'Credits') for column in sort_order]
print(sort_way)

df_sorted = df_nodup.sort_values(by=sort_order, ascending=sort_way, ignore_index=True)
df_sorted

In [None]:
# Group the sorted courses by (Master, Track) without re-sorting the group
# keys, and preview the first non-null value of each column per group.
dftest = df_sorted.groupby(['Master', 'Track'], sort=False, group_keys=False)
dftest.first()

In [None]:
# List the (Master, Track) keys of all groups.
print(list(dftest.groups.keys()))

# Inspect one group without its key columns. Index 2 is a hard-coded pick and
# raises IndexError when there are fewer than three groups.
dftest.get_group(list(dftest.groups.keys())[2]).drop(columns=['Master', 'Track'])

In [None]:
# Build the TSV text: a header line, then for each (Master, Track) group a
# master title row (only when the master changes), a track title row, and
# the group's courses without the 'Master' and 'Track' columns.
dfGrouped = df_sorted.groupby(['Master', 'Track'], sort=False)

# Take the keys from dfGrouped itself so this cell does not depend on a
# variable created by a different cell.
rowGroups = list(dfGrouped.groups.keys())

# Number of columns left once 'Master' and 'Track' are dropped; title rows
# are padded with tabs to the same number of fields.
columnCount = df_sorted.shape[1] - 2

# NOTE(review): title rows end in '\r\n' while to_csv uses its default line
# terminator (os.linesep per the pandas docs), so line endings may be mixed
# on non-Windows platforms -- confirm whether that matters for the consumer.
dumpParts = [df_sorted.head(0).drop(columns=['Master', 'Track']).to_csv(sep='\t', index=False, header=True)]

# Track the previous master explicitly: indexing rowGroups[i-1] would wrap
# around to the last group at i == 0 and could skip the first master title.
previousMaster = None
for group in rowGroups:
    masterName, trackName = group
    if masterName != previousMaster:
        print(masterName)
        dumpParts.append(masterName + '\t'*(columnCount-1) + '\r\n')
        previousMaster = masterName
    print(f'\t{trackName}')
    dumpParts.append('\t' + trackName + '\t'*(columnCount-2) + '\r\n')

    truncatedDataFrame = dfGrouped.get_group(group).drop(columns=['Master', 'Track'])
    dumpParts.append(truncatedDataFrame.to_csv(sep='\t', index=False, header=False))

dumpData = ''.join(dumpParts)

In [None]:
# Write the assembled TSV text. newline='' disables newline translation, so
# the line endings already present in dumpData are written unchanged.
with open(Path('sandbox')/'output.tsv', 'w', newline='') as outputFile:
    outputFile.write(dumpData)