In [1]:
# Mount Cell
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# Imports
import matplotlib.pyplot as plt
import re
import pandas as pd
import numpy as np
import sys
import os
import csv
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem import PorterStemmer
import gensim
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel
from gensim.similarities import MatrixSimilarity
from nltk.stem import WordNetLemmatizer as wnl
import sys
import random

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [3]:
!ls "/content/drive/MyDrive/cs478ModelTraining/Collection 3"

Dataset1	gradescope_method1.csv		gradescope_method3_tuned.csv
Dataset2	gradescope_method2_doc2vec.csv	stage_2_final_method3.csv
ERD_grades.csv	gradescope_method2_sbert.csv	stage2_grading_method1.csv


# GENERAL VARIABLES AND FUNCTIONS

# YOU MAY HAVE TO CHANGE THESE PATHS DEPENDING ON WHERE THEY EXIST IN YOUR GOOGLE DRIVE

In [4]:
# Important directory paths
# Directory that is scanned for the question 1 submission .txt files.
question1_directory_path = "/content/drive/MyDrive/cs478ModelTraining/Collection 3/Dataset1"
# Directory that is scanned for the question 2 submission .txt files.
question2_directory_path = "/content/drive/MyDrive/cs478ModelTraining/Collection 3/Dataset2"

In [5]:
def getERDGradesAndListOfSubmissionFiles():
  """Load the ground-truth grades and list the submission text files.

  Returns:
    A 3-tuple ``(grades, question1_files, question2_files)``. ``grades`` maps
    each ``ERD_No`` read from ERD_grades.csv (tab separated) to a
    ``(dataset1_grade, dataset2_grade)`` tuple, ordered by ERD number. The two
    lists hold the names of the ``.txt`` files found in
    ``question1_directory_path`` and ``question2_directory_path``.
  """
  grades_table = pd.read_csv("/content/drive/MyDrive/cs478ModelTraining/Collection 3/ERD_grades.csv", sep='\t')

  # ERD number -> (dataset 1 grade, dataset 2 grade); a repeated ERD number keeps its last row.
  grades_by_erd = {}
  for _, record in grades_table.iterrows():
    grades_by_erd[record['ERD_No']] = (record['dataset1_grade'], record['dataset2_grade'])
  grades_by_erd = dict(sorted(grades_by_erd.items()))
  # 1-103 are graded, 104 - 136 are ungraded, doesn't exist --> 62, 89, 113, 120, 126 --> total 131

  # Text files with objects + text from OD/OCR, one list per question.
  txt_files_question1 = [name for name in os.listdir(question1_directory_path) if name.endswith('.txt')]
  txt_files_question2 = [name for name in os.listdir(question2_directory_path) if name.endswith('.txt')]

  return grades_by_erd, txt_files_question1, txt_files_question2

# METHOD 1 CODE

In [30]:
def preprocess_all_student_submissions_per_question(question_number, question1_list_of_files, question2_list_of_files):
  """Turn every submission file of one question into a list of lemmatized ERD words.

  Args:
    question_number: 1 selects the question 1 file list and directory; any
      other value selects question 2.
    question1_list_of_files: file names located in ``question1_directory_path``.
    question2_list_of_files: file names located in ``question2_directory_path``.

  Returns:
    dict mapping the integer ERD number (the file name up to its first ``.``)
    to the list of processed words of that submission.
  """
  if question_number == 1:
    question_list_of_files = question1_list_of_files
    question_directory_path = question1_directory_path
  else:
    question_list_of_files = question2_list_of_files
    question_directory_path = question2_directory_path

  final_erd_dict = {}

  stop_words = set(stopwords.words('english'))

  # One lemmatizer is shared by all words instead of building a new one per word.
  lemmatizer = wnl()

  for erd in question_list_of_files:

    file_path = question_directory_path + "/" + erd
    list_of_erd_words = []

    with open(file_path, 'r') as file:
      # Drop newline characters and lower-case every line.
      lines = [line.strip("\n").lower() for line in file.readlines()]

    for line in lines:

      # REMOVE RELATION and IDENT REL NAMES CODE
      # Slice comparisons are used because indexing line[5] / line[11] raised
      # IndexError on a line that ends right after "rel" / "ident_rel".
      if line[2:12] == "ident_rel'":
        list_of_erd_words.append("ident_rel")
        continue
      if line[2:6] == "rel'":
        list_of_erd_words.append("rel")
        continue

      i = 1
      word_prefix = ""

      for word in word_tokenize(line):
        if word not in string.punctuation and word not in stop_words:

          word = word.strip(string.punctuation)

          # PREFIX CODE:
          if i == 1:
            # Object name: append it unprefixed and remember it as the prefix.
            word_prefix = word
            list_of_erd_words.append(word)
            # NOTE(review): this `continue` skips the increment below, so `i`
            # only advances on punctuation/stop-word tokens. Behaviour is kept
            # as in the original; confirm it is intended.
            continue

          if "_" in word or "-" in word:
            # Split on hyphens/underscores and prefix every piece.
            for split_word in re.split(r"[_\-]", word):
              list_of_erd_words.append(word_prefix + "_" + split_word)
          else:
            list_of_erd_words.append(word_prefix + "_" + word)

        i = i + 1

    erd_number = int(erd.split('.')[0])

    # LEMMATIZATION CODE
    final_erd_dict[erd_number] = [lemmatizer.lemmatize(w) for w in list_of_erd_words]

  return final_erd_dict


In [23]:
def build_tf_idf_vectors(graded_erd_dictionary, ungraded_erd_dictionary):
  """Build weighted vectors for both groups and a similarity index over the graded one.

  Args:
    graded_erd_dictionary: ERD number -> list of words, graded submissions.
    ungraded_erd_dictionary: ERD number -> list of words, ungraded submissions.

  Returns:
    ``(graded_vectors, ungraded_vectors, graded_similarity_index)`` where the
    index is a ``MatrixSimilarity`` built from the graded vectors.
  """
  graded_documents = list(graded_erd_dictionary.values())
  ungraded_documents = list(ungraded_erd_dictionary.values())

  # A single vocabulary covers graded AND ungraded submissions.
  vocabulary = corpora.Dictionary(graded_documents + ungraded_documents)

  # Bag-of-words representation, kept separate per group.
  graded_bow = []
  for document in graded_documents:
    graded_bow.append(vocabulary.doc2bow(document, allow_update=True))
  ungraded_bow = []
  for document in ungraded_documents:
    ungraded_bow.append(vocabulary.doc2bow(document, allow_update=True))

  # One model per group, both configured with the 'lnc' SMART scheme.
  graded_model = TfidfModel(graded_bow, smartirs='lnc')
  ungraded_model = TfidfModel(ungraded_bow, smartirs='lnc')

  graded_vectors = graded_model[graded_bow]
  ungraded_vectors = ungraded_model[ungraded_bow]

  # Similarity index over the graded submissions only.
  graded_similarity_index = MatrixSimilarity(graded_vectors, num_features=len(vocabulary))

  return graded_vectors, ungraded_vectors, graded_similarity_index


In [24]:
def compute_predicted_student_grades(corpus_tfidf_graded_ERDs, corpus_tfidf_ungraded_ERDs, similarity_index_graded_ERDs, list_of_graded_ERDs_keys, list_of_ungraded_ERDs_keys, graded_ERDs_dict, question_number, k_value):
  """Predict each ungraded submission's grade as the mean grade of its most similar graded ones.

  Args:
    corpus_tfidf_graded_ERDs: not used by this function; kept for the existing call sites.
    corpus_tfidf_ungraded_ERDs: iterable of vectors, one per ungraded submission.
    similarity_index_graded_ERDs: object indexed with an ungraded vector that
      yields one similarity per graded submission.
    list_of_graded_ERDs_keys: ERD numbers of the graded submissions, in index order.
    list_of_ungraded_ERDs_keys: ERD numbers of the ungraded submissions, in corpus order.
    graded_ERDs_dict: ERD number -> (dataset1_grade, dataset2_grade).
    question_number: 1 uses the first grade of the pair, anything else the second.
    k_value: number of nearest graded submissions to average.

  Returns:
    dict mapping each ungraded ERD number to its predicted grade.

  Raises:
    ValueError: if ``k_value`` is below 1, or there is no graded submission to compare against.
  """
  if k_value < 1:
    raise ValueError("k_value must be at least 1")

  grade_position = 0 if question_number == 1 else 1

  final_grades_dict = {}

  for i, tfidf_ungraded in enumerate(corpus_tfidf_ungraded_ERDs):
    # Similarity of the current ungraded submission to every graded submission.
    sims = similarity_index_graded_ERDs[tfidf_ungraded]

    # Indices of the (at most) k most similar graded submissions.
    top_k_indices = sorted(range(len(sims)), key=lambda x: sims[x], reverse=True)[:k_value]
    if not top_k_indices:
      raise ValueError("no graded submissions are available to compare against")

    neighbour_grades = [
        graded_ERDs_dict[list_of_graded_ERDs_keys[index]][grade_position]
        for index in top_k_indices
    ]

    # Divide by the number of neighbours actually found: dividing by k_value
    # deflated the average whenever fewer than k graded submissions exist.
    final_grades_dict[list_of_ungraded_ERDs_keys[i]] = sum(neighbour_grades) / len(neighbour_grades)

  return final_grades_dict

In [31]:
def export_predictions_to_csv(final_grades_dict_question1, final_grades_dict_question2):
  """Write the method 1 predictions of both questions to stage2_grading_method1.csv.

  Args:
    final_grades_dict_question1: ERD number -> predicted grade for question 1;
      its keys decide which rows are written.
    final_grades_dict_question2: ERD number -> predicted grade for question 2.
  """
  # The parameters are read here; previously the body referenced the
  # similarly named notebook globals (final_graded_dict_question1/2) instead.
  erd_numbers = sorted(final_grades_dict_question1.keys())

  predictions = []

  # One row per ERD number, grades rounded to two decimals.
  for erd in erd_numbers:

    dataset1_grade = final_grades_dict_question1.get(erd)
    dataset2_grade = final_grades_dict_question2.get(erd)
    predictions.append([erd, round(dataset1_grade, 2), round(dataset2_grade, 2)])

  # Write the predictions to a CSV file
  with open('/content/drive/MyDrive/cs478ModelTraining/Collection 3/stage2_grading_method1.csv', mode='w', newline='') as file:
      writer = csv.writer(file)
      writer.writerow(['ERD_No', 'dataset1_grade', 'dataset2_grade'])
      writer.writerows(predictions)

  print("Predictions exported to stage2_grading_method1.csv!")

In [32]:
def evaluate_predicted_results(final_grades_dict, graded_ERDs_dict, question_number):
  """Mean absolute error of the predictions that have a ground-truth grade.

  Args:
    final_grades_dict: ERD number -> predicted grade.
    graded_ERDs_dict: ERD number -> (dataset1_grade, dataset2_grade).
    question_number: 1 compares against the first grade of the pair, anything
      else against the second.

  Returns:
    The mean absolute error, or None when no predicted ERD number appears in
    ``graded_ERDs_dict``.
  """
  grade_slot = 0 if question_number == 1 else 1

  # Absolute error for every prediction whose actual grade is known.
  absolute_errors = [
      abs(predicted_grade - graded_ERDs_dict[erd_key][grade_slot])
      for erd_key, predicted_grade in final_grades_dict.items()
      if erd_key in graded_ERDs_dict
  ]

  if not absolute_errors:
    return None
  return sum(absolute_errors) / len(absolute_errors)

In [33]:
def get_graded_dict(question_number):
  """Preprocess one question's submissions and split them by whether a grade exists.

  Args:
    question_number: 1 for question 1; any other value is treated as question 2.

  Returns:
    ``(graded, ungraded, grades)``: two dicts of ERD number -> word list, for
    ERD numbers present in / absent from the grades dictionary, followed by
    the grades dictionary itself.
  """
  graded_ERDS_dict, files_question1, files_question2 = getERDGradesAndListOfSubmissionFiles()

  selected_question = 1 if question_number == 1 else 2
  words_by_erd = preprocess_all_student_submissions_per_question(selected_question, files_question1, files_question2)

  # Partition on membership in the ground-truth grades.
  graded_erd_dict = {erd: words for erd, words in words_by_erd.items() if erd in graded_ERDS_dict}
  ungraded_erd_dict = {erd: words for erd, words in words_by_erd.items() if erd not in graded_ERDS_dict}

  return graded_erd_dict, ungraded_erd_dict, graded_ERDS_dict



In [58]:
def predict_student_grades(question_number, k_value):
  """Run method 1 end to end for one question.

  Args:
    question_number: 1 or 2; any other value prints a message and returns None.
    k_value: number of most similar graded submissions averaged per prediction.

  Returns:
    dict mapping each ungraded ERD number to its predicted grade, or None for
    an invalid question number.
  """
  if question_number not in (1, 2):
    print("Invalid question number")
    return

  # THIS LINE IS FOR GRADESCOPE SUBMISSION
  graded_words, ungraded_words, ground_truth_grades = get_graded_dict(question_number)

  graded_vectors, ungraded_vectors, graded_index = build_tf_idf_vectors(graded_words, ungraded_words)

  # THIS LINE IS FOR GRADESCOPE SUBMISSION
  return compute_predicted_student_grades(
      graded_vectors,
      ungraded_vectors,
      graded_index,
      list(graded_words.keys()),
      list(ungraded_words.keys()),
      ground_truth_grades,
      question_number,
      k_value,
  )

In [59]:
# Method 1 predictions for question 1 and question 2, each with k_value = 12.
final_graded_dict_question1 = predict_student_grades(1, 12)
final_graded_dict_question2 = predict_student_grades(2, 12)
# Write both prediction dictionaries to the method 1 CSV file.
export_predictions_to_csv(final_graded_dict_question1, final_graded_dict_question2)

Predictions exported to stage2_grading_method1.csv!


# YOU MUST RUN THE CELL BELOW BEFORE RUNNING METHOD 2 OR METHOD 3!

In [36]:
# HAVE TO RUN THIS LINE FOR METHOD 2
# Graded / ungraded word lists per question; the third return value (the
# grades dictionary) is bound to a placeholder name and not used.
graded_erd_dict_1, ungraded_erd_dict_1, NOT_USED_1 = get_graded_dict(1)
graded_erd_dict_2, ungraded_erd_dict_2, NOT_USED_2 = get_graded_dict(2)

# METHOD 2 CODE

In [None]:
# General-purpose imports
import os
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine  # For cosine similarity

# Gensim imports
from gensim.models import Doc2Vec  # For document embeddings
from gensim.models.doc2vec import TaggedDocument  # For tagging documents

# sklearn imports
from sklearn.metrics.pairwise import cosine_similarity  # Optional: Batch cosine similarity
from sklearn.neighbors import NearestNeighbors  # KNN for embeddings
from sklearn.ensemble import RandomForestRegressor  # Random Forest for Method 3
from sklearn.preprocessing import StandardScaler  # Optional: Feature scaling

# NLP imports (if preprocessing involves tokenization or stemming)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Ensure nltk resources are available
nltk.download('punkt')  # Tokenizer data
nltk.download('stopwords')  # Stopwords list


In [None]:
import nltk
nltk.download('all')


In [19]:
import nltk
nltk.download('punkt')         # For word tokenization
nltk.download('stopwords')     # For English stopwords
nltk.download('wordnet')       # For lemmatization


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [37]:
from gensim.models.doc2vec import TaggedDocument

def prepare_documents_for_doc2vec(erd_dict):
    """Wrap each ERD's word list in a ``TaggedDocument`` tagged with its id as a string.

    Args:
        erd_dict: ERD id -> list of words.

    Returns:
        list of ``TaggedDocument`` objects, in the dictionary's iteration order.
    """
    tagged_documents = []
    for erd_id, words in erd_dict.items():
        tagged_documents.append(TaggedDocument(words=words, tags=[str(erd_id)]))
    return tagged_documents


In [38]:
from gensim.models import Doc2Vec

def build_doc2vec_model(tagged_documents, vector_size=100, epochs=40, window=5, min_count=1, workers=4, seed=None):
    """Build the vocabulary for and train a Doc2Vec model on the tagged documents.

    Args:
        tagged_documents: list of ``TaggedDocument`` objects to train on.
        vector_size: dimensionality of the document vectors.
        epochs: number of training epochs.
        window: context window size passed to ``Doc2Vec``.
        min_count: minimum word frequency passed to ``Doc2Vec``.
        workers: number of worker threads passed to ``Doc2Vec``.
        seed: optional random seed; when None the library default is used.
            NOTE(review): for repeatable runs gensim also requires a single
            worker thread -- confirm against the gensim documentation.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    # Only forward the seed when the caller supplied one, so the default call
    # builds the model with exactly the same arguments as before.
    optional_arguments = {} if seed is None else {"seed": seed}
    model = Doc2Vec(
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        epochs=epochs,
        **optional_arguments,
    )
    model.build_vocab(tagged_documents)
    model.train(tagged_documents, total_examples=model.corpus_count, epochs=model.epochs)
    return model

In [39]:
def compute_doc2vec_embeddings(model, erd_dict):
    """Infer one vector per ERD by calling ``model.infer_vector`` on its word list.

    Args:
        model: object exposing ``infer_vector(words)``.
        erd_dict: ERD id -> list of words.

    Returns:
        dict mapping each ERD id to the vector inferred for it.
    """
    embeddings_by_erd = {}
    for erd_id, words in erd_dict.items():
        embeddings_by_erd[erd_id] = model.infer_vector(words)
    return embeddings_by_erd


In [40]:
def predict_grades_using_embeddings(graded_embeddings, ungraded_embeddings, graded_ERDs_dict, k, dataset_number):
    """Predict each ungraded ERD's grade as the mean grade of its most similar graded ERDs.

    Similarity is ``1 - cosine distance`` between embedding vectors.

    Args:
        graded_embeddings: ERD id -> vector, graded submissions.
        ungraded_embeddings: ERD id -> vector, ungraded submissions.
        graded_ERDs_dict: ERD id -> (dataset1_grade, dataset2_grade).
        k: number of nearest graded ERDs to average.
        dataset_number: 1 or 2; selects which grade of the pair is averaged.

    Returns:
        dict mapping each ungraded ERD id to its predicted grade.

    Raises:
        ValueError: if ``k`` is below 1, or there are no graded embeddings to compare against.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    final_grades_dict = {}

    for ungraded_id, ungraded_vector in ungraded_embeddings.items():
        # Similarity between this ungraded ERD and every graded ERD.
        similarities = [
            (graded_id, 1 - cosine(ungraded_vector, graded_vector))
            for graded_id, graded_vector in graded_embeddings.items()
        ]

        # The (at most) k most similar graded ERDs.
        top_k_neighbors = sorted(similarities, key=lambda x: x[1], reverse=True)[:k]
        if not top_k_neighbors:
            raise ValueError("no graded embeddings are available to compare against")

        neighbor_grades = [
            graded_ERDs_dict[neighbor_id][dataset_number - 1]
            for neighbor_id, _ in top_k_neighbors
        ]

        # Divide by the neighbours actually found: dividing by k deflated the
        # average whenever fewer than k graded ERDs exist.
        final_grades_dict[ungraded_id] = sum(neighbor_grades) / len(neighbor_grades)

    return final_grades_dict


In [41]:
def approach2_grade_prediction(graded_erd_dict, ungraded_erd_dict, graded_ERDs_dict, k_value, dataset_number):
    """Train Doc2Vec on all submissions, embed them, and predict the ungraded grades.

    Args:
        graded_erd_dict: ERD id -> word list, graded submissions.
        ungraded_erd_dict: ERD id -> word list, ungraded submissions.
        graded_ERDs_dict: ERD id -> (dataset1_grade, dataset2_grade).
        k_value: number of nearest graded ERDs averaged per prediction.
        dataset_number: which grade of the pair to predict (1 or 2).

    Returns:
        dict mapping each ungraded ERD id to its predicted grade.
    """
    # Training corpus: graded documents first, then ungraded ones.
    training_corpus = prepare_documents_for_doc2vec(graded_erd_dict) + prepare_documents_for_doc2vec(ungraded_erd_dict)

    # A new model is trained on every call.
    trained_model = build_doc2vec_model(training_corpus)
    embeddings_graded = compute_doc2vec_embeddings(trained_model, graded_erd_dict)
    embeddings_ungraded = compute_doc2vec_embeddings(trained_model, ungraded_erd_dict)

    return predict_grades_using_embeddings(
        embeddings_graded,
        embeddings_ungraded,
        graded_ERDs_dict,
        k_value,
        dataset_number,
    )


In [42]:
# Get graded ERD dictionary and list of files
# graded_ERDs_dict maps ERD number -> (dataset1_grade, dataset2_grade); the
# two file lists are not used again in this cell.
graded_ERDs_dict, question1_list_of_files, question2_list_of_files = getERDGradesAndListOfSubmissionFiles()

# Specify k value for KNN
k_value = 12

# Run predictions for Dataset 1 grade (dataset_number=1)
# graded_erd_dict_1 / ungraded_erd_dict_1 must already exist in the notebook namespace.
final_grades_dict_dataset1 = approach2_grade_prediction(graded_erd_dict_1, ungraded_erd_dict_1, graded_ERDs_dict, k_value, dataset_number=1)
print("Predicted grades for ungraded ERDs (Dataset 1):")
print(final_grades_dict_dataset1)

# Run predictions for Dataset 2 grade (dataset_number=2)
final_grades_dict_dataset2 = approach2_grade_prediction(graded_erd_dict_2, ungraded_erd_dict_2, graded_ERDs_dict, k_value, dataset_number=2)
print("Predicted grades for ungraded ERDs (Dataset 2):")
print(final_grades_dict_dataset2)

Predicted grades for ungraded ERDs (Dataset 1):
{128: 80.33416666666668, 114: 81.22249999999998, 129: 80.66749999999999, 115: 80.33416666666665, 116: 83.77750000000002, 117: 80.445, 107: 77.88916666666667, 106: 79.33333333333333, 105: 78.44500000000001, 112: 76.66666666666667, 104: 80.0, 111: 78.55583333333333, 110: 84.89, 135: 79.1125, 108: 79.66749999999998, 109: 81.00083333333333, 134: 76.77833333333332, 121: 78.00000000000001, 136: 76.88833333333334, 122: 79.66666666666667, 123: 78.33416666666668, 127: 76.33333333333333, 132: 79.55583333333333, 133: 79.00083333333332, 130: 75.55583333333333, 131: 80.00083333333332, 125: 74.44416666666666, 119: 81.55583333333334, 118: 79.445, 124: 75.11166666666668}
Predicted grades for ungraded ERDs (Dataset 2):
{128: 86.66666666666667, 114: 86.33333333333333, 116: 92.33333333333333, 115: 93.0, 129: 84.33333333333333, 117: 91.0, 107: 89.0, 110: 91.0, 111: 94.66666666666667, 105: 85.66666666666667, 104: 92.0, 106: 88.66666666666667, 112: 92.33333333

In [43]:
def export_predictions_to_csv_method2(final_grades_dict_dataset1, final_grades_dict_dataset2, filename="gradescope_method2_doc2vec.csv"):
    """Write the Doc2Vec predictions for both datasets to a CSV file on Drive.

    Args:
        final_grades_dict_dataset1: ERD number -> predicted dataset 1 grade;
            its keys decide which rows are written.
        final_grades_dict_dataset2: ERD number -> predicted dataset 2 grade.
        filename: name of the CSV file created inside the Collection 3 folder.
    """
    # Ascending ERD order keeps the output stable between runs.
    rows = [
        [erd, round(final_grades_dict_dataset1.get(erd), 2), round(final_grades_dict_dataset2.get(erd), 2)]
        for erd in sorted(final_grades_dict_dataset1)
    ]

    destination = f'/content/drive/MyDrive/cs478ModelTraining/Collection 3/{filename}'
    with open(destination, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['ERD_No', 'dataset1_grade', 'dataset2_grade'])
        writer.writerows(rows)

    print(f"Predictions exported to {filename}.")

In [44]:
# Generate predictions for Dataset 1 and Dataset 2
# NOTE(review): approach2_grade_prediction trains a new Doc2Vec model on each
# call, so these values are recomputed here rather than reused from an earlier
# cell and are not guaranteed to match previously printed predictions.
final_grades_dict_dataset1 = approach2_grade_prediction(graded_erd_dict_1, ungraded_erd_dict_1, graded_ERDs_dict, k_value, dataset_number=1)
final_grades_dict_dataset2 = approach2_grade_prediction(graded_erd_dict_2, ungraded_erd_dict_2, graded_ERDs_dict, k_value, dataset_number=2)

# Export predictions to CSV with the file name "gradescope_method2_doc2vec.csv"
export_predictions_to_csv_method2(final_grades_dict_dataset1, final_grades_dict_dataset2)


Predictions exported to gradescope_method2_doc2vec.csv.


In [45]:
########################################################################################## SBERT #############################################################################

In [46]:
pip install -U sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 3.2.1
    Uninstalling sentence-transformers-3.2.1:
      Successfully uninstalled sentence-transformers-3.2.1
Successfully installed sentence-transformers-3.3.1


In [60]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained S-BERT model
sbert_model = SentenceTransformer('paraphrase-distilroberta-base-v1')


# Convert ERDs to S-BERT embeddings
def get_sbert_embeddings(erd_dict):
    """Encode every ERD with the notebook-level ``sbert_model``.

    Each ERD's tokens are joined with single spaces into one string, which is
    then passed to ``sbert_model.encode``.

    Args:
        erd_dict: ERD id -> list of tokens.

    Returns:
        dict mapping each ERD id to the value returned by ``encode``.
    """
    return {
        erd_id: sbert_model.encode(' '.join(tokens))
        for erd_id, tokens in erd_dict.items()
    }


In [48]:
# Generate embeddings for graded and ungraded ERDs
# Requires graded_erd_dict_1/2 and ungraded_erd_dict_1/2 to already exist in
# the notebook namespace; each call encodes one group of one question.
graded_embeddings_1 = get_sbert_embeddings(graded_erd_dict_1)
ungraded_embeddings_1 = get_sbert_embeddings(ungraded_erd_dict_1)
graded_embeddings_2 = get_sbert_embeddings(graded_erd_dict_2)
ungraded_embeddings_2 = get_sbert_embeddings(ungraded_erd_dict_2)


In [49]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

def knn_grade_prediction(graded_embeddings, ungraded_embeddings, graded_ERDs_dict, k, dataset_number):
    """Predict each ungraded ERD's grade as the mean grade of its nearest graded ERDs.

    Neighbours are found with ``NearestNeighbors`` using the cosine metric.

    Args:
        graded_embeddings: ERD id -> embedding vector, graded submissions.
        ungraded_embeddings: ERD id -> embedding vector, ungraded submissions.
        graded_ERDs_dict: ERD id -> (dataset1_grade, dataset2_grade).
        k: number of neighbours to average; capped at the number of graded ERDs.
        dataset_number: 1 or 2; selects which grade of the pair is averaged.

    Returns:
        dict mapping each ungraded ERD id to its predicted grade.

    Raises:
        ValueError: if ``k`` is below 1 or ``graded_embeddings`` is empty.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Convert embeddings to a matrix format for KNN
    graded_ids = list(graded_embeddings.keys())
    if not graded_ids:
        raise ValueError("graded_embeddings is empty; there is nothing to compare against")
    graded_matrix = np.array([graded_embeddings[erd_id] for erd_id in graded_ids])
    ungraded_ids = list(ungraded_embeddings.keys())
    ungraded_matrix = np.array([ungraded_embeddings[erd_id] for erd_id in ungraded_ids])

    # Asking for more neighbours than fitted samples makes kneighbors raise,
    # so the request is capped at the number of graded ERDs.
    neighbor_count = min(k, len(graded_ids))
    knn = NearestNeighbors(n_neighbors=neighbor_count, metric='cosine')
    knn.fit(graded_matrix)

    final_grades_dict = {}

    # Find the nearest neighbours and predict a grade for each ungraded ERD
    for ungraded_id, ungraded_vector in zip(ungraded_ids, ungraded_matrix):
        _, indices = knn.kneighbors([ungraded_vector])
        neighbor_ids = [graded_ids[idx] for idx in indices[0]]

        # Average grade over the neighbours that were found
        final_grades_dict[ungraded_id] = np.mean(
            [graded_ERDs_dict[neighbor_id][dataset_number - 1] for neighbor_id in neighbor_ids]
        )

    return final_grades_dict


In [50]:
def export_predictions_to_csv_sbert(final_grades_dict_dataset1, final_grades_dict_dataset2, filename="gradescope_method2_sbert.csv"):
    """Write the S-BERT predictions for both datasets to a CSV file on Drive.

    Args:
        final_grades_dict_dataset1: ERD number -> predicted dataset 1 grade;
            its keys decide which rows are written.
        final_grades_dict_dataset2: ERD number -> predicted dataset 2 grade.
        filename: name of the CSV file created inside the Collection 3 folder.
    """
    # One row per ERD number in ascending order, grades rounded to two decimals.
    rows = []
    for erd_number in sorted(final_grades_dict_dataset1):
        grade_dataset1 = final_grades_dict_dataset1.get(erd_number)
        grade_dataset2 = final_grades_dict_dataset2.get(erd_number)
        rows.append([erd_number, round(grade_dataset1, 2), round(grade_dataset2, 2)])

    output_path = f'/content/drive/MyDrive/cs478ModelTraining/Collection 3/{filename}'
    with open(output_path, mode='w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['ERD_No', 'dataset1_grade', 'dataset2_grade'])
        csv_writer.writerows(rows)

    print(f"Predictions exported to {filename}.")

In [51]:
# Predict grades for Dataset 1
# k is given as the literal 12 here rather than through the k_value variable.
# graded_ERDs_dict must already exist in the notebook namespace.
final_grades_dict_dataset1 = knn_grade_prediction(
    graded_embeddings_1, ungraded_embeddings_1, graded_ERDs_dict, k=12, dataset_number=1
)
print("Predicted grades for ungraded ERDs (Dataset 1):")
print(final_grades_dict_dataset1)

# Predict grades for Dataset 2
final_grades_dict_dataset2 = knn_grade_prediction(
    graded_embeddings_2, ungraded_embeddings_2, graded_ERDs_dict, k=12, dataset_number=2
)
print("Predicted grades for ungraded ERDs (Dataset 2):")
print(final_grades_dict_dataset2)

# Export predictions to CSV with the file name "gradescope_method2_sbert.csv"
export_predictions_to_csv_sbert(final_grades_dict_dataset1, final_grades_dict_dataset2)


Predicted grades for ungraded ERDs (Dataset 1):
{128: 75.00083333333333, 114: 81.44583333333333, 129: 75.99916666666667, 115: 76.2225, 116: 74.66583333333334, 117: 78.22166666666668, 107: 79.55666666666667, 106: 79.44500000000001, 105: 75.33416666666668, 112: 78.55583333333333, 104: 80.66749999999998, 111: 81.44500000000001, 110: 79.77916666666667, 135: 81.89, 108: 81.5575, 109: 83.22333333333333, 134: 74.88833333333334, 121: 78.44416666666667, 136: 75.88916666666667, 122: 77.3325, 123: 80.11083333333333, 127: 75.6675, 132: 78.0, 133: 78.66833333333334, 130: 77.44500000000001, 131: 74.44499999999998, 125: 74.88916666666667, 119: 81.00166666666667, 118: 75.00000000000001, 124: 75.44500000000001}
Predicted grades for ungraded ERDs (Dataset 2):
{128: 87.33333333333333, 114: 88.33333333333333, 116: 91.33333333333333, 115: 91.66666666666667, 129: 84.66666666666667, 117: 90.66666666666667, 107: 90.66666666666667, 110: 89.66666666666667, 111: 92.33333333333333, 105: 88.66666666666667, 104: 87

# METHOD 3 CODE

In [52]:
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [53]:
import numpy as np

def extract_features(erd_dict):
    """Compute a 12-value feature list for every ERD from its token list.

    Counts use substring matching, so a token is counted as an entity,
    relationship or attribute when it merely contains "entity", "rel" or
    "attr". Every ratio and length statistic falls back to 0 when its
    denominator (or the token list) is empty.

    Args:
        erd_dict: ERD id -> list of tokens.

    Returns:
        dict mapping each ERD id to
        ``[num_entities, num_relationships, num_attributes, total_tokens,
        avg_word_length, std_word_length, unique_tokens, attr_entity_ratio,
        rel_entity_ratio, entity_token_ratio, rel_token_ratio, attr_token_ratio]``.
    """
    def _ratio(numerator, denominator):
        # Division guarded against an empty denominator.
        return numerator / denominator if denominator > 0 else 0

    def _count_containing(tokens, marker):
        # Number of tokens that contain `marker` as a substring.
        return sum(1 for token in tokens if marker in token)

    features = {}
    for erd_id, tokens in erd_dict.items():
        entity_count = _count_containing(tokens, "entity")
        relationship_count = _count_containing(tokens, "rel")
        attribute_count = _count_containing(tokens, "attr")
        token_count = len(tokens)

        if tokens:
            mean_length = np.mean([len(token) for token in tokens])
            std_length = np.std([len(token) for token in tokens])
        else:
            mean_length = 0
            std_length = 0

        features[erd_id] = [
            entity_count,
            relationship_count,
            attribute_count,
            token_count,
            mean_length,
            std_length,
            len(set(tokens)),
            _ratio(attribute_count, entity_count),
            _ratio(relationship_count, entity_count),
            _ratio(entity_count, token_count),
            _ratio(relationship_count, token_count),
            _ratio(attribute_count, token_count),
        ]

    return features


In [54]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


def random_forest_grade_prediction_with_tuning(graded_features, graded_ERDs_dict, ungraded_features, dataset_number):
    """Grid-search a random forest on the graded ERDs and predict the ungraded ones.

    Args:
        graded_features: ERD id -> feature list, graded submissions.
        graded_ERDs_dict: ERD id -> (dataset1_grade, dataset2_grade).
        ungraded_features: ERD id -> feature list, ungraded submissions.
        dataset_number: which grade of the pair is the training target (1 or 2).

    Returns:
        dict mapping each ungraded ERD id to the grade predicted by the best
        estimator found by the search.
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import GridSearchCV

    # Training matrix and targets built from the graded submissions.
    X_train, y_train = prepare_data_for_dataset(graded_features, graded_ERDs_dict, dataset_number)

    # Hyperparameter combinations explored by the grid search.
    search_space = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # 3-fold search scored by negative mean absolute error, using all cores.
    tuner = GridSearchCV(
        estimator=RandomForestRegressor(random_state=42),
        param_grid=search_space,
        scoring='neg_mean_absolute_error',
        cv=3,
        n_jobs=-1,
    )
    tuner.fit(X_train, y_train)

    tuned_forest = tuner.best_estimator_
    print(f"Best Hyperparameters: {tuner.best_params_}")

    # Predict in the dictionary's iteration order so ids and rows stay aligned.
    ungraded_ids = list(ungraded_features.keys())
    feature_rows = [ungraded_features[erd_id] for erd_id in ungraded_ids]
    predicted_grades = tuned_forest.predict(feature_rows)

    return dict(zip(ungraded_ids, predicted_grades))


In [55]:
# Prepare training data for Dataset 1 or Dataset 2
def prepare_data_for_dataset(graded_features, graded_ERDs_dict, dataset_number):
    """Build aligned training inputs and targets for one dataset's grade.

    Args:
        graded_features: ERD id -> feature list.
        graded_ERDs_dict: ERD id -> (dataset1_grade, dataset2_grade).
        dataset_number: 1 or 2; selects which grade of the pair is the target.

    Returns:
        ``(X_train, y_train)``: the feature lists and the matching grades, both
        in the iteration order of ``graded_features``.
    """
    grade_index = dataset_number - 1  # dataset1 or dataset2 grade
    erd_ids = list(graded_features)
    X_train = [graded_features[erd_id] for erd_id in erd_ids]
    y_train = [graded_ERDs_dict[erd_id][grade_index] for erd_id in erd_ids]
    return X_train, y_train


In [61]:
def export_predictions_to_csv_method3(final_grades_dict_dataset1, final_grades_dict_dataset2, filename="stage_2_final_method3.csv"):
    """Write the method 3 predictions for both datasets to a CSV file on Drive.

    Args:
        final_grades_dict_dataset1: ERD number -> predicted dataset 1 grade;
            its keys decide which rows are written.
        final_grades_dict_dataset2: ERD number -> predicted dataset 2 grade.
        filename: name of the CSV file created inside the Collection 3 folder.
    """
    header = ['ERD_No', 'dataset1_grade', 'dataset2_grade']

    # Rows in ascending ERD order with both grades rounded to two decimals.
    ordered_erds = sorted(final_grades_dict_dataset1.keys())
    body = [
        [
            erd,
            round(final_grades_dict_dataset1.get(erd, None), 2),
            round(final_grades_dict_dataset2.get(erd, None), 2),
        ]
        for erd in ordered_erds
    ]

    with open(f'/content/drive/MyDrive/cs478ModelTraining/Collection 3/{filename}', mode='w', newline='') as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(body)

    print(f"Predictions exported to {filename}.")

In [62]:
# Predict grades for Dataset 1

# Feature lists for the graded and ungraded ERDs of both datasets.
graded_features_1 = extract_features(graded_erd_dict_1)
ungraded_features_1 = extract_features(ungraded_erd_dict_1)
graded_features_2 = extract_features(graded_erd_dict_2)
ungraded_features_2 = extract_features(ungraded_erd_dict_2)

# Reload the grades dictionary; the two file lists are not used in this cell.
graded_ERDs_dict, question1_list_of_files, question2_list_of_files = getERDGradesAndListOfSubmissionFiles()


# Predict grades for Dataset 1 (a grid search runs inside this call).
final_grades_dict_dataset1 = random_forest_grade_prediction_with_tuning(graded_features_1, graded_ERDs_dict, ungraded_features_1, dataset_number=1)
print("Predicted grades for ungraded ERDs (Dataset 1):")
print(final_grades_dict_dataset1)

# Predict grades for Dataset 2
final_grades_dict_dataset2 = random_forest_grade_prediction_with_tuning(graded_features_2, graded_ERDs_dict, ungraded_features_2, dataset_number=2)
print("Predicted grades for ungraded ERDs (Dataset 2):")
print(final_grades_dict_dataset2)




Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300}
Predicted grades for ungraded ERDs (Dataset 1):
{128: 75.60318173985057, 114: 79.27530706242831, 129: 78.2060468931994, 115: 80.94750773462101, 116: 74.7673011225349, 117: 78.98403318653024, 107: 79.93769487530449, 106: 77.75901288870176, 105: 75.58885605713732, 112: 94.38928991429864, 104: 90.82887060397938, 111: 80.36025173890384, 110: 75.54120835035336, 135: 81.72443420847681, 108: 80.68954301485556, 109: 77.27353593610096, 134: 75.77981632663635, 121: 81.83071987549482, 136: 72.9148086687849, 122: 79.93807741462247, 123: 74.59462326368079, 127: 71.36029255217007, 132: 79.35104706682212, 133: 86.4509236367745, 130: 75.5680986562675, 131: 76.90046928203057, 125: 76.45053280397163, 119: 79.15156688089688, 118: 73.62552731117742, 124: 76.1301119952433}
Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Predicted grades

In [63]:
# Export predictions to CSV
export_predictions_to_csv_method3(final_grades_dict_dataset1, final_grades_dict_dataset2, filename="stage_2_final_method3.csv")

Predictions exported to stage_2_final_method3.csv.
