<a href="https://colab.research.google.com/github/urimtal/SDU/blob/main/Moodle_Assignment_Grading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install beautifulsoup4 openpyxl


In [2]:
from google.colab import drive
from pathlib import Path

# --- Configuration: adjust these two values for your own course ---

# Root folder holding the downloaded Moodle submissions.
# It must match the location inside your Google Drive exactly.
BASE = Path('/content/drive/MyDrive/2025 Fall/Data Journalism 2025/Moodle Assignments')

# Glob pattern for the submission files searched for (recursively) under BASE.
# Use '*.html' if your files are not called 'onlinetext.html'.
FILENAME_FILTER = 'onlinetext.html'   # or '*.html'

# Make Google Drive available under /content/drive.
drive.mount('/content/drive')

# Sanity check: prints False when BASE does not point at an existing folder.
print("Base exists:", BASE.exists())


Mounted at /content/drive
Base exists: True


In [3]:
from bs4 import BeautifulSoup
import pandas as pd

# Collect every submission file below BASE. rglob() takes the glob pattern
# as-is, so FILENAME_FILTER works unchanged for both 'onlinetext.html' and
# '*.html' -- no special-casing of the wildcard is required.
files = sorted(BASE.rglob(FILENAME_FILTER))
print("Found HTML files:", len(files))
assert len(files) > 0, "No HTML files found. Check BASE or FILENAME_FILTER."

rows = []
for f in files:
    # Tolerant decoding: errors='ignore' drops bytes that are not valid UTF-8,
    # so decoding itself can never raise. Anything that does raise here is a
    # genuine I/O problem (missing file, Drive hiccup) and is allowed to
    # surface instead of being hidden behind a blanket `except`.
    html = f.read_text(encoding='utf-8', errors='ignore')
    soup = BeautifulSoup(html, 'html.parser')
    # Visible text only; text nodes (e.g. several <p>) are joined with one space.
    text = soup.get_text(" ", strip=True)
    rows.append({
        "full_path": str(f),
        "filename": f.name,
        "response": text
    })

# One row per submission file.
df = pd.DataFrame(rows)
print("Parsed submissions:", len(df))
df.head(3)


Found HTML files: 99
Parsed submissions: 99


Unnamed: 0,full_path,filename,response
0,/content/drive/MyDrive/2025 Fall/Data Journali...,onlinetext.html,I got the basics of data journalism. There wer...
1,/content/drive/MyDrive/2025 Fall/Data Journali...,onlinetext.html,"Today's lesson was important, as it explained ..."
2,/content/drive/MyDrive/2025 Fall/Data Journali...,onlinetext.html,Today’s introduction class with data journalis...


In [4]:
import re
from pathlib import Path

def parse_student_from_path(path: str):
    """Extract the student's name and the numeric id from a submission path.

    Only the name of the folder that directly contains the file is inspected.
    The expected folder layout is
    ``<Name Surname>_<digits>_assignsubmission_<plugin>``, for example
    ``Jane Doe_12345_assignsubmission_onlinetext`` or
    ``Jane Doe_12345_assignsubmission_file``. A single trailing underscore
    after the plugin name is tolerated. Matching is case-insensitive.

    Args:
        path: Path of a submission file, as a string.

    Returns:
        A ``(name, id)`` tuple of strings. When the folder name does not
        follow the expected layout, ``id`` is ``""`` and ``name`` is the folder
        name with underscores replaced by spaces.

    Note:
        The id is simply the digit run found in the folder name. In this
        notebook's output the same student carries different numbers in
        different weeks, so it is presumably a per-assignment identifier
        rather than a stable student number -- confirm before using it as a key.
    """
    student_folder = Path(path).parent.name
    # The plugin suffix is matched generically so that folders produced by
    # other submission types (not just "onlinetext") are parsed as well.
    m = re.match(
        r"^(?P<name>.+?)_(?P<id>\d{3,})_assignsubmission_[a-z0-9]+_?$",
        student_folder,
        flags=re.IGNORECASE,
    )
    if m:
        return m.group("name").strip(), m.group("id")
    # Fallback: no recognisable id part -- just clean up the underscores.
    return student_folder.replace("_", " ").strip(), ""

# Parse every path once, then split the (name, id) pairs into two parallel lists.
parsed_students = [parse_student_from_path(fp) for fp in df["full_path"]]
names = [student for student, _ in parsed_students]
ids = [number for _, number in parsed_students]

# Attach the parsed values as new columns of the submissions table.
df["student_name"] = names
df["student_id"]   = ids
df.head(5)


Unnamed: 0,full_path,filename,response,student_name,student_id
0,/content/drive/MyDrive/2025 Fall/Data Journali...,onlinetext.html,I got the basics of data journalism. There wer...,Aigerim Kalymzhanova,301296
1,/content/drive/MyDrive/2025 Fall/Data Journali...,onlinetext.html,"Today's lesson was important, as it explained ...",Aigerim Konysbekova,301295
2,/content/drive/MyDrive/2025 Fall/Data Journali...,onlinetext.html,Today’s introduction class with data journalis...,Akniyet Oralgazy,301290
3,/content/drive/MyDrive/2025 Fall/Data Journali...,onlinetext.html,"Today's class was informative, Open Data types...",Alina Dalbekova,301292
4,/content/drive/MyDrive/2025 Fall/Data Journali...,onlinetext.html,https://docs.google.com/document/d/1OVCoOQ0UuV...,Arsen Ilkenov,301283


In [5]:
import re
from pathlib import Path

def parse_week_and_title_from_seg(seg: str):
    """Parse a week number and an assignment title out of one path segment.

    Em and en dashes are normalised to ``-`` first. Three layouts are tried
    in order:

    A. ``Week 5 ...``  -- the word "week" followed by a 1-2 digit number
    B. ``5 Week ...``  -- a 1-2 digit number followed by the word "week"
    C. ``05 - Title``  -- a leading 1-2 digit number, accepted only when it
       lies in 1..30

    Spaces, hyphens and underscores are all accepted as separators, so
    ``Week_5_Title`` and ``05_Title`` are recognised too. For layouts A and B
    the title is the text after the match up to the next ``-``; for layout C
    it is the text after the number up to the next ``-``.

    Args:
        seg: A single folder name taken from a path.

    Returns:
        ``(week, title)`` where ``week`` is an ``int``; ``(None, "")`` when no
        layout matches.
    """
    s = seg.replace('—', '-').replace('–', '-').strip()

    # "\b" counts "_" as a word character, so it refuses to end a number that
    # is directly followed by an underscore ("Week 5_Title"). These lookarounds
    # only reject letters and digits, which lets "_" act as a separator while
    # still rejecting "Week 123", "midweek 5" or "5 weeks".
    not_after_alnum = r'(?<![^\W_])'
    not_before_alnum = r'(?![^\W_])'

    keyword_patterns = (
        # A) 'Week 5 ...'
        not_after_alnum + r'week\s*[-_ ]*(\d{1,2})' + not_before_alnum,
        # B) '5 Week ...'
        not_after_alnum + r'(\d{1,2})\s*[-_ ]*week' + not_before_alnum,
    )
    for pattern in keyword_patterns:
        m = re.search(pattern, s, flags=re.IGNORECASE)
        if m:
            # Drop separators left over between the match and the title.
            after = s[m.end():].strip(" _-")
            return int(m.group(1)), after.split('-', 1)[0].strip()

    # C) Fallback: '05 - Title'
    m = re.match(r'^(\d{1,2})' + not_before_alnum + r'\s*[-_ ]*([^-]*)', s)
    if m:
        w = int(m.group(1))
        if 1 <= w <= 30:  # plausibility bound for a bare leading number
            return w, (m.group(2) or "").strip()
    return None, ""

def extract_week_from_path(path: str):
    """Find the week number and title encoded in one of a path's folders.

    The folder names of ``path`` (everything except the final component) are
    checked from the innermost outwards with ``parse_week_and_title_from_seg``;
    the first folder that yields a week number wins.

    Returns:
        ``(week, title)`` from the first matching folder, or ``(None, "")``
        when no folder matches.
    """
    folders = Path(path).parts[:-1]
    # Iterate from the last folder back to the first (closest to the file first).
    for idx in range(len(folders) - 1, -1, -1):
        week, title = parse_week_and_title_from_seg(folders[idx])
        if week is None:
            continue
        return week, title
    return None, ""

# Resolve (week, title) for every submission, then split into parallel lists.
week_info = [extract_week_from_path(fp) for fp in df["full_path"]]
weeks = [week for week, _ in week_info]
titles = [title for _, title in week_info]

# Attach the parsed values as new columns of the submissions table.
df["week_number"]     = weeks
df["assignment_title"] = titles
df.head(5)


Unnamed: 0,full_path,filename,response,student_name,student_id,week_number,assignment_title
0,/content/drive/MyDrive/2025 Fall/Data Journali...,onlinetext.html,I got the basics of data journalism. There wer...,Aigerim Kalymzhanova,301296,1,Attendance Reflect of the Day
1,/content/drive/MyDrive/2025 Fall/Data Journali...,onlinetext.html,"Today's lesson was important, as it explained ...",Aigerim Konysbekova,301295,1,Attendance Reflect of the Day
2,/content/drive/MyDrive/2025 Fall/Data Journali...,onlinetext.html,Today’s introduction class with data journalis...,Akniyet Oralgazy,301290,1,Attendance Reflect of the Day
3,/content/drive/MyDrive/2025 Fall/Data Journali...,onlinetext.html,"Today's class was informative, Open Data types...",Alina Dalbekova,301292,1,Attendance Reflect of the Day
4,/content/drive/MyDrive/2025 Fall/Data Journali...,onlinetext.html,https://docs.google.com/document/d/1OVCoOQ0UuV...,Arsen Ilkenov,301283,1,Attendance Reflect of the Day


In [9]:
def last_name(name: str):
    """Return the last whitespace-separated word of ``name``, lower-cased.

    ``name`` is converted with ``str()`` first; if it contains no words at all
    the result is the empty string.
    """
    tokens = str(name).split()
    if not tokens:
        return ""
    return tokens[-1].lower()

# Number of whitespace-separated words in each response (missing text -> 0).
df["word_count"]   = df["response"].fillna("").str.split().apply(len)
# Lower-cased last word of the student's name, used only as a sort key.
df["last_name_key"] = df["student_name"].apply(last_name)

final_cols = ["student_name", "week_number", "assignment_title", "response", "word_count", "student_id", "full_path"]
# Order by surname, then by full name, then by week. The full name acts as a
# tiebreaker so that two different students who share a surname each keep
# their rows together instead of being interleaved week by week.
final = (df
         .sort_values(["last_name_key", "student_name", "week_number"],
                      ascending=[True, True, True])
         [final_cols]
         .reset_index(drop=True))

print("✅ Final rows:", len(final))
final.head(10)

✅ Final rows: 99


Unnamed: 0,student_name,week_number,assignment_title,response,word_count,student_id,full_path
0,Daniya Akimzhanova,3,Attendance Reflect of the Day,"to create hypothesis we have to find problem, ...",62,313140,/content/drive/MyDrive/2025 Fall/Data Journali...
1,Daniya Akimzhanova,7,Attendance Reflect of the Day,we watched the video about kenya and their sit...,35,386712,/content/drive/MyDrive/2025 Fall/Data Journali...
2,Aruzhan Baimenova,1,Attendance Reflect of the Day,I was looking forward to this subject the most...,81,301282,/content/drive/MyDrive/2025 Fall/Data Journali...
3,Aruzhan Baimenova,2,Attendance Reflect of the Day,"We can use data to compare the information, an...",60,303472,/content/drive/MyDrive/2025 Fall/Data Journali...
4,Aruzhan Baimenova,3,Attendance Reflect of the Day,Today I get to know about the main questions t...,79,313133,/content/drive/MyDrive/2025 Fall/Data Journali...
5,Aruzhan Baimenova,5,Attendance Reflect of the Project Day,"I actually got what I expected, all the topics...",121,340744,/content/drive/MyDrive/2025 Fall/Data Journali...
6,Aruzhan Baimenova,6,Attendance Reflect of the Day,"Data should contain the date in right way, by ...",64,366720,/content/drive/MyDrive/2025 Fall/Data Journali...
7,Aruzhan Baimenova,7,Attendance Reflect of the Day,We watched a video project about malnutrition ...,144,386718,/content/drive/MyDrive/2025 Fall/Data Journali...
8,Dilara Balkash,1,Attendance Reflect of the Day,At today's lesson I learned new things and got...,66,301297,/content/drive/MyDrive/2025 Fall/Data Journali...
9,Dilara Balkash,2,Attendance Reflect of the Day,Metadata is data about data. When the data is ...,52,303466,/content/drive/MyDrive/2025 Fall/Data Journali...


In [10]:
# Write the sorted table to an Excel workbook placed directly inside BASE.
OUT = str(BASE.joinpath("moodle_responses_sorted.xlsx"))
# index=False keeps the DataFrame's row index out of the sheet.
final.to_excel(OUT, index=False)
print("📁 Saved:", OUT)


📁 Saved: /content/drive/MyDrive/2025 Fall/Data Journalism 2025/Moodle Assignments/moodle_responses_sorted.xlsx
