In [1]:
import pandas as pd
# Input CSV locations (relative paths, resolved against the current working directory).
# NOTE(review): the space in "train .csv" looks like a typo but presumably matches
# the file name on disk — confirm before renaming.
train_file = "data/train .csv"
# Lookup table read later for its 'MisconceptionId' and 'MisconceptionName' columns.
mapping_file = "data/misconception_mapping.csv"

In [2]:
# Load the question table and the misconception lookup table from disk.
train_df = pd.read_csv(train_file)
mapping_df = pd.read_csv(mapping_file)

# Print a heading followed by the first rows of each table.
print("Train Data Preview:", train_df.head(), sep="\n")
print("\nMapping Data Preview:", mapping_df.head(), sep="\n")

Train Data Preview:
   QuestionId  ConstructId                                      ConstructName  \
0           0          856  Use the order of operations to carry out calcu...   
1           1         1612  Simplify an algebraic fraction by factorising ...   
2           2         2774            Calculate the range from a list of data   
3           3         2377  Recall and use the intersecting diagonals prop...   
4           4         3387  Substitute positive integer values into formul...   

   SubjectId                                        SubjectName CorrectAnswer  \
0         33                                             BIDMAS             A   
1       1077                    Simplifying Algebraic Fractions             D   
2        339  Range and Interquartile Range from a List of Data             B   
3         88                       Properties of Quadrilaterals             C   
4         67                          Substitution into Formula             A   

      

In [3]:
def map_correct_answer(row):
    """Return the text of the answer option that `row` marks as correct.

    `row['CorrectAnswer']` supplies the option label; the text is looked up
    in the same row under the key ``Answer<label>Text``.
    """
    correct_label = row['CorrectAnswer']
    text_key = f"Answer{correct_label}Text"
    return row[text_key]

# Re-read the training data and add a column holding, for every question,
# the text of the option named by its CorrectAnswer value.
train_df = pd.read_csv(train_file).assign(
    CorrectAnswerText=lambda questions: questions.apply(map_correct_answer, axis=1)
)

# Show the answer label next to the text it resolved to
train_df[['CorrectAnswer', 'CorrectAnswerText']]


Unnamed: 0,CorrectAnswer,CorrectAnswerText
0,A,\( 3 \times(2+4)-5 \)
1,D,Does not simplify
2,B,Only\nKatie
3,C,\( 90^{\circ} \)
4,A,\( 30 \)
...,...,...
1864,C,\( 23 \)
1865,B,\( 11 \div 3 \)
1866,B,\( 31 \)
1867,B,Only Katie


In [4]:
def map_misconception(misconception_id, mapping):
    """Look up the name for `misconception_id` in `mapping`.

    Returns None when the id is missing (as judged by pandas' NA check),
    the mapped value when the id is a key of `mapping`, and the string
    "Unknown" when the id is present but has no entry in `mapping`.
    """
    # Missing ids carry no misconception: bail out before touching the mapping.
    if pd.isna(misconception_id):
        return None
    return mapping.get(misconception_id, "Unknown")

In [5]:
# Create a dictionary to map misconception IDs to names
misconception_dict = mapping_df.set_index('MisconceptionId')['MisconceptionName'].to_dict()

# Add one name column per answer option (MisconceptionA ... MisconceptionD),
# each derived from the matching Misconception<letter>Id column. A single loop
# replaces four statements that differed only in the option letter.
for letter in "ABCD":
    train_df[f'Misconception{letter}'] = train_df[f'Misconception{letter}Id'].apply(
        map_misconception, args=(misconception_dict,)
    )


In [6]:
# Print a heading, then the first rows of the four mapped misconception-name columns.
print(
    "\nMapped Misconception Names:",
    train_df[['MisconceptionA', 'MisconceptionB', 'MisconceptionC', 'MisconceptionD']].head(),
    sep="\n",
)


Mapped Misconception Names:
                                      MisconceptionA  \
0                                               None   
1  Does not know that to factorise a quadratic ex...   
2  Believes if you changed all values by the same...   
3        Does not know the properties of a rectangle   
4                                               None   

                                      MisconceptionB  \
0                                               None   
1  Thinks that when you cancel identical terms fr...   
2                                               None   
3        Does not know the properties of a rectangle   
4                                               None   

                                      MisconceptionC  \
0                                               None   
1  Does not know that to factorise a quadratic ex...   
2  Believes if you changed all values by the same...   
3                                               None   
4                

In [10]:
# Split the dataframe into separate rows for each answer: every question row
# becomes four rows, one per answer option A-D, in that order.
answer_letters = ("A", "B", "C", "D")
# Question-level columns copied unchanged onto each of the four answer rows.
shared_columns = [
    'QuestionId', 'ConstructId', 'ConstructName', 'CorrectAnswer',
    'SubjectId', 'SubjectName', 'QuestionText',
]

split_rows = []
for _, row in train_df.iterrows():
    shared = {column: row[column] for column in shared_columns}
    for letter in answer_letters:
        split_rows.append({
            # Key of the form "<QuestionId>_<letter>", built first so it is the leading column.
            'QuestionId_Answer': f"{row['QuestionId']}_{letter}",
            **shared,
            # Option-specific fields, selected by the answer letter.
            'AnswerText': row[f'Answer{letter}Text'],
            'MisconceptionId': row[f'Misconception{letter}Id'],
            'MisconceptionName': row[f'Misconception{letter}'],
            'CorrectAnswerText': row['CorrectAnswerText'],
        })

split_df = pd.DataFrame(split_rows)
# Drop answer rows whose MisconceptionName is missing; the original index labels
# are kept (no reset), so gaps in the index mark the dropped rows.
split_df = split_df.dropna(subset=['MisconceptionName'])
print("\nSplit Data Preview:")
print(split_df.head())



Split Data Preview:
  QuestionId_Answer  QuestionId  ConstructId  \
3               0_D           0          856   
4               1_A           1         1612   
5               1_B           1         1612   
6               1_C           1         1612   
8               2_A           2         2774   

                                       ConstructName CorrectAnswer  SubjectId  \
3  Use the order of operations to carry out calcu...             A         33   
4  Simplify an algebraic fraction by factorising ...             D       1077   
5  Simplify an algebraic fraction by factorising ...             D       1077   
6  Simplify an algebraic fraction by factorising ...             D       1077   
8            Calculate the range from a list of data             B        339   

                                         SubjectName  \
3                                             BIDMAS   
4                    Simplifying Algebraic Fractions   
5                    Simplifying Al

In [11]:
# Print one line per remaining answer row: question, construct, correct answer
# text, and the text of this answer option.
for _, answer_row in split_df.iterrows():
    print(
        f"QuestionText: {answer_row['QuestionText']}, "
        f"ConstructName: {answer_row['ConstructName']},"
        f"CorrectAnswerText: {answer_row['CorrectAnswerText']}, "
        f"AnswerText: {answer_row['AnswerText']}"
    )


QuestionText: \[
3 \times 2+4-5
\]
Where do the brackets need to go to make the answer equal \( 13 \) ?, ConstructName: Use the order of operations to carry out calculations involving powers,CorrectAnswerText: \( 3 \times(2+4)-5 \), AnswerText: Does not need brackets
QuestionText: Simplify the following, if possible: \( \frac{m^{2}+2 m-3}{m-3} \), ConstructName: Simplify an algebraic fraction by factorising the numerator,CorrectAnswerText: Does not simplify, AnswerText: \( m+1 \)
QuestionText: Simplify the following, if possible: \( \frac{m^{2}+2 m-3}{m-3} \), ConstructName: Simplify an algebraic fraction by factorising the numerator,CorrectAnswerText: Does not simplify, AnswerText: \( m+2 \)
QuestionText: Simplify the following, if possible: \( \frac{m^{2}+2 m-3}{m-3} \), ConstructName: Simplify an algebraic fraction by factorising the numerator,CorrectAnswerText: Does not simplify, AnswerText: \( m-1 \)
QuestionText: Tom and Katie are discussing the \( 5 \) plants with these heights: