# **1. Data Collection**

**1.0 Using RAPIDS cuDF for Best Speed**

In [None]:
# Enable the cuDF pandas accelerator extension. This cell sits above the
# `import pandas` cell on purpose.
# NOTE(review): RAPIDS documents that the extension must be loaded before
# pandas is imported for it to take effect — confirm against the cudf.pandas docs.
%load_ext cudf.pandas

**1.1 Importing File Formats**

In [None]:
import pandas as pd

# Importing CSV files
# The competition input folder is defined once so the three reads below stay in
# sync and the notebook can be pointed at another location by editing one line.
DATA_DIR = '/kaggle/input/eedi-mining-misconceptions-in-mathematics'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
misconception_mapping_df = pd.read_csv(f'{DATA_DIR}/misconception_mapping.csv')

**1.21 Checking Data Types**

In [None]:
# Column dtypes of train.csv as inferred by read_csv (no explicit dtype was passed)
train_df.dtypes

In [None]:
# Column dtypes of test.csv as inferred by read_csv (no explicit dtype was passed)
test_df.dtypes

In [None]:
# Column dtypes of misconception_mapping.csv as inferred by read_csv
misconception_mapping_df.dtypes

**1.22 Checking Duplicates**

In [None]:
# Count the rows that are exact repeats of an earlier row, once per table.
# duplicated() marks such rows with True, so summing the mask gives the count.
train_duplicates = train_df.duplicated().sum()
test_duplicates = test_df.duplicated().sum()
mapping_duplicates = misconception_mapping_df.duplicated().sum()

# Report the three counts: train, test, then the misconception mapping
print(f'Duplicates in train.csv: {train_duplicates}')
print(f'Duplicates in test.csv: {test_duplicates}')
print(f'Duplicates in misconception_mapping.csv: {mapping_duplicates}')

# **2. Data Exploration**

**2.1 Understanding the Structure**

**2.11 View the First Few Rows**

In [None]:
# Preview train.csv; head() without an argument returns the first five rows
train_df.head()

In [None]:
# Preview test.csv; head() without an argument returns the first five rows
test_df.head()

In [None]:
# Preview misconception_mapping.csv; head() without an argument returns the first five rows
misconception_mapping_df.head()

**2.12 View DataFrame Information**

In [None]:
# Structural summary of train.csv: column names, non-null counts, dtypes and memory usage
train_df.info()

In [None]:
# Structural summary of test.csv: column names, non-null counts, dtypes and memory usage
test_df.info()

**2.13 Summary Statistics**

In [None]:
# Structural summary of misconception_mapping.csv: column names, non-null counts, dtypes and memory usage
misconception_mapping_df.info()

In [None]:
# Summary statistics for train.csv.
# describe() is called with its defaults, which summarise the numeric columns only
# when the frame mixes numeric and non-numeric columns.
train_df.describe()

In [None]:
# Summary statistics for test.csv.
# describe() is called with its defaults, which summarise the numeric columns only
# when the frame mixes numeric and non-numeric columns.
test_df.describe()

In [None]:
# Summary statistics for misconception_mapping.csv.
# describe() is called with its defaults, which summarise the numeric columns only
# when the frame mixes numeric and non-numeric columns.
misconception_mapping_df.describe()

**2.14 View Column Names**

In [None]:
# Grab the column index of each of the three tables
train_columns = train_df.columns
test_columns = test_df.columns
mapping_columns = misconception_mapping_df.columns

# Print them in the order train, test, misconception mapping
print(f'Train Columns: {train_columns}')
print(f'Test Columns: {test_columns}')
print(f'Misconception Mapping Columns: {mapping_columns}')

**2.15 Shape of the Data**

In [None]:
# (rows, columns) of each of the three tables
train_shape = train_df.shape
test_shape = test_df.shape
mapping_shape = misconception_mapping_df.shape

# Print them in the order train, test, misconception mapping
print(f'Train shape: {train_shape}')
print(f'Test shape: {test_shape}')
print(f'Mapping shape: {mapping_shape}')

**2.2 Missing Values**

**2.21 Identifying Missing Values**

In [None]:
# Per-column number of missing entries in each table
# (isna() is the same check as isnull(); summing the boolean mask counts the gaps)
train_missing = train_df.isna().sum()
test_missing = test_df.isna().sum()
mapping_missing = misconception_mapping_df.isna().sum()

# Print the three reports: train, test, then the misconception mapping
print(f'Missing values in train.csv:\n{train_missing}')
print(f'Missing values in test.csv:\n{test_missing}')
print(f'Missing values in misconception_mapping.csv:\n{mapping_missing}')

**2.22 Dropping Rows or Columns with Missing Values**

In [None]:
# Row-wise: keep only the train.csv rows that have no missing value in any column
train_cleaned = train_df.dropna()

# Column-wise: keep only the train.csv columns that have no missing value in any row
# (stored under its own name so it does not hide the row-wise result)
train_cleaned_cols = train_df.dropna(axis=1)


In [None]:
# Row-wise: keep only the test.csv rows that have no missing value in any column.
# Stored as test_cleaned so the cleaned training frame is not overwritten with test data.
test_cleaned = test_df.dropna()

# Column-wise: keep only the test.csv columns that have no missing value in any row
test_cleaned_cols = test_df.dropna(axis=1)

In [None]:
# Row-wise: keep only the misconception_mapping.csv rows that have no missing value.
# Stored as mapping_cleaned so the cleaned training frame is not overwritten with mapping data.
mapping_cleaned = misconception_mapping_df.dropna()

# Column-wise: keep only the misconception_mapping.csv columns that have no missing value
mapping_cleaned_cols = misconception_mapping_df.dropna(axis=1)

# **3. Data Cleaning**

**3.1 Identifying Outliers using Z-Score**

In [None]:
from scipy import stats

# Calculate Z-scores for numeric columns in train.csv.
# nan_policy='omit' computes each column's mean and standard deviation over its
# non-missing values only. With the default ('propagate') one NaN turns the whole
# column's z-scores into NaN, so no outlier could ever be detected in that column.
# NOTE(review): select_dtypes also picks up identifier-like numeric columns
# (names ending in 'Id'); z-scores of identifiers are unlikely to be meaningful —
# confirm which columns should really be screened.
numeric_train = train_df.select_dtypes(include=['float64', 'int64'])
z_scores_train = pd.DataFrame(
    stats.zscore(numeric_train.to_numpy(dtype='float64'), nan_policy='omit'),
    index=numeric_train.index,      # keep row alignment with train_df
    columns=numeric_train.columns,
)

# Identify outliers in train.csv: rows where |Z-score| > 3 in at least one numeric column.
# A missing value keeps a NaN z-score; NaN > 3 is False, so it is never flagged.
outliers_train = train_df[(z_scores_train.abs() > 3).any(axis=1)]

**3.2 Remove Outliers**

In [None]:
# Remove rows with outliers based on Z-score in train.csv.
# A row is dropped only if some numeric column has |Z-score| > 3, i.e. the exact
# complement of the outlier definition used above. Requiring "z < 3 and z > -3 in
# every column" instead would also discard every row that has a NaN z-score,
# because NaN fails both comparisons.
has_outlier = (abs(z_scores_train) > 3).any(axis=1)
train_no_outliers = train_df[~has_outlier]

# **Complete Dataset In-Depth Analysis**

**Visualizing Misconception Distribution**

In [None]:
import plotly.express as px

# Stack the four per-answer misconception columns into long form; melt() puts the
# IDs in a column called 'value', which is what the histogram bins.
misconceptions_train = pd.melt(
    train_df[['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId']]
)

fig = px.histogram(
    misconceptions_train,
    x='value',
    title='Distribution of Misconceptions in Train Set',
    labels={'value': 'MisconceptionId'},
)
fig.show()

**Analyzing Correct Answer Distribution**

In [None]:
# Number of training questions per value of the CorrectAnswer column
fig = px.histogram(
    train_df,
    x='CorrectAnswer',
    labels={'CorrectAnswer': 'Correct Answer'},
    title='Distribution of Correct Answers',
)
fig.show()

**Exploring Subject vs Misconceptions**

In [None]:
# Question count per subject, with one bar group per MisconceptionAId value
subject_hist_options = dict(
    x='SubjectName',
    color='MisconceptionAId',
    barmode='group',
    title='Subjects vs Misconceptions',
    labels={'SubjectName': 'Subject', 'MisconceptionAId': 'Misconception A'},
    height=1000,
)
fig = px.histogram(train_df, **subject_hist_options)
fig.show()

**How are Misconceptions Distributed Across Different Subjects?**

In [None]:
# Misconception IDs are nominal labels, so an average of them carries no meaning.
# For every subject, count how many distinct misconception IDs appear in each of
# the four answer-option columns instead (nunique() ignores missing IDs).
misconception_subject = (
    train_df
    .groupby('SubjectName')[['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId']]
    .nunique()
    .reset_index()
)

# Melting the data for easy plotting: one row per (subject, answer-option column)
misconception_subject_melted = misconception_subject.melt(
    id_vars='SubjectName',
    var_name='Misconception',
    value_name='DistinctMisconceptions',
)

# Plotting Misconception distribution across different subjects
fig = px.bar(
    misconception_subject_melted,
    x='SubjectName',
    y='DistinctMisconceptions',
    color='Misconception',
    title='How Misconceptions are Distributed Across Different Subjects',
    height=1000,
    labels={'SubjectName': 'Subject', 'DistinctMisconceptions': 'Distinct Misconception IDs'},
)
fig.update_xaxes(tickangle=90)  # subject names are long; rotate them to stay readable
fig.show()

**What is the Distribution of Questions by Constructs and Their Subjects?**

In [None]:
# Question count per construct; each bar is split by the subject of the questions
construct_hist_options = dict(
    x='ConstructName',
    color='SubjectName',
    title='Distribution of Questions by Constructs and Their Subjects',
    height=1500,
    labels={'ConstructName': 'Construct', 'SubjectName': 'Subject'},
)
fig = px.histogram(train_df, **construct_hist_options)
# Vertical tick labels
fig.update_xaxes(tickangle=90)
fig.show()

**How are Correct Answers Distributed Across Different Constructs?**

In [None]:
# Question count per construct, coloured by which option is the correct answer
fig = px.histogram(
    train_df,
    x='ConstructName',
    color='CorrectAnswer',
    height=1500,
    title='Correct Answer Distribution Across Different Constructs',
    labels={'ConstructName': 'Construct', 'CorrectAnswer': 'Correct Answer'},
)
# Vertical tick labels
fig.update_xaxes(tickangle=90)
fig.show()

**What is the Frequency of Misconceptions for Each Answer Option?**

In [None]:
# Long-form table of every misconception ID: 'variable' names the source column
# (A, B, C or D) and 'MisconceptionId' holds the ID itself.
misconception_freq = pd.melt(
    train_df[['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId']],
    value_name='MisconceptionId',
)

# Histogram of the pooled IDs
fig = px.histogram(
    misconception_freq,
    x='MisconceptionId',
    labels={'MisconceptionId': 'Misconception ID'},
    title='Frequency of Misconceptions for Each Answer Option',
)
fig.show()

**What is the Relationship Between Misconceptions and Correct Answers?**

In [None]:
# Question count per correct-answer option, coloured by the MisconceptionAId value
answer_hist_options = dict(
    x='CorrectAnswer',
    color='MisconceptionAId',
    title='Relationship Between Misconceptions and Correct Answers',
    labels={'CorrectAnswer': 'Correct Answer', 'MisconceptionAId': 'Misconception A'},
)
fig = px.histogram(train_df, **answer_hist_options)
fig.show()

**How Do Different Subjects Contribute to Different Constructs?**

In [None]:
# Two-level sunburst: subjects form the inner ring, their constructs the outer ring
hierarchy = ['SubjectName', 'ConstructName']
fig = px.sunburst(
    train_df,
    path=hierarchy,
    labels={'SubjectName': 'Subject', 'ConstructName': 'Construct'},
    title='Contribution of Different Subjects to Different Constructs',
)
fig.show()

**Which Misconceptions are Most Common Across All Questions?**

In [None]:
# Combining all misconception columns into one named Series for frequency analysis.
# Missing IDs are dropped: they are not misconceptions and the histogram ignores them anyway.
misconception_counts = (
    pd.concat(
        [
            train_df['MisconceptionAId'],
            train_df['MisconceptionBId'],
            train_df['MisconceptionCId'],
            train_df['MisconceptionDId'],
        ],
        ignore_index=True,
    )
    .dropna()
    .rename('MisconceptionId')
)

# Plotting the most common misconceptions.
# The data is handed over as a one-column frame and x refers to that column by name,
# so the key in `labels` actually matches the plotted column and the axis title applies.
fig = px.histogram(
    misconception_counts.to_frame(),
    x='MisconceptionId',
    title='Most Common Misconceptions Across All Questions',
    labels={'MisconceptionId': 'Misconception ID'},
)
fig.show()

**How Does the Distribution of Answers Vary by Subject?**

In [None]:
# Question count per subject, coloured by which option is the correct answer
fig = px.histogram(
    train_df,
    x='SubjectName',
    color='CorrectAnswer',
    height=1000,
    title='Distribution of Answers by Subject',
    labels={'SubjectName': 'Subject', 'CorrectAnswer': 'Correct Answer'},
)
# Vertical tick labels
fig.update_xaxes(tickangle=90)
fig.show()

**What Misconception IDs are Linked to Specific Constructs?**

In [None]:
# Grouping misconceptions by construct.
# Misconception IDs are nominal labels, so an average of them carries no meaning;
# count the distinct IDs linked to each construct per answer-option column instead
# (nunique() ignores missing IDs).
misconception_construct = (
    train_df
    .groupby('ConstructName')[['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId']]
    .nunique()
    .reset_index()
)

# Plotting misconception IDs linked to constructs.
# melt() names the source-column field 'variable'; it drives the bar colour.
fig = px.bar(
    misconception_construct.melt(id_vars='ConstructName', value_name='DistinctMisconceptions'),
    x='ConstructName',
    y='DistinctMisconceptions',
    color='variable',
    title='Misconception IDs Linked to Specific Constructs',
    height=1500,
    labels={
        'ConstructName': 'Construct',
        'DistinctMisconceptions': 'Distinct Misconception IDs',
        'variable': 'Misconception column',
    },
)
fig.update_xaxes(tickangle=90)  # construct names are long; rotate them to stay readable
fig.show()

**Which Constructs are Associated with the Most Misconceptions?**

In [None]:
# Non-null misconception IDs per construct, first per answer-option column ...
misconception_id_counts = train_df.groupby('ConstructName')[
    ['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId']
].count()
# ... then added up across the four columns into one total per construct
misconception_per_construct = misconception_id_counts.sum(axis=1).reset_index(name='TotalMisconceptions')

# Bar chart of the totals
fig = px.bar(
    misconception_per_construct,
    x='ConstructName',
    y='TotalMisconceptions',
    height=1500,
    title='Constructs Associated with the Most Misconceptions',
    labels={'ConstructName': 'Construct', 'TotalMisconceptions': 'Total Misconceptions'},
)
fig.update_xaxes(tickangle=90)
fig.show()

**How Are Different Constructs Associated with Misconceptions?**

In [None]:
# Per construct, the number of non-null IDs in each of the four misconception columns
construct_misconceptions = (
    train_df
    .groupby('ConstructName')[['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId']]
    .count()
    .reset_index()
)

# One bar per construct with one segment per misconception column
fig = px.bar(
    construct_misconceptions,
    x='ConstructName',
    y=['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId'],
    height=1500,
    text_auto=True,
    title='Association of Different Constructs with Misconceptions',
    labels={'ConstructName': 'Construct', 'value': 'Count'},
)

# Stack the segments and turn the tick labels vertical
fig.update_layout(barmode='stack')
fig.update_xaxes(tickangle=90)
fig.show()

**What is the Distribution of Misconceptions Across Different Answer Options?**

In [None]:
# Answer option -> column holding the misconception ID attached to that option
option_columns = {
    'A': 'MisconceptionAId',
    'B': 'MisconceptionBId',
    'C': 'MisconceptionCId',
    'D': 'MisconceptionDId',
}

# One row per answer option with the number of non-missing misconception IDs
misconception_distribution = pd.DataFrame({
    'AnswerOption': list(option_columns),
    'Count': [train_df[column].notna().sum() for column in option_columns.values()],
})

# Bar chart with the count printed on each bar
fig = px.bar(
    misconception_distribution,
    x='AnswerOption',
    y='Count',
    text_auto=True,
    title='Distribution of Misconceptions Across Different Answer Options',
    labels={'AnswerOption': 'Answer Option', 'Count': 'Count'},
)

fig.show()

**What is the Proportion of Each Construct in the Dataset?**

In [None]:
# Questions per construct; value_counts() lists the most frequent construct first
construct_counts = train_df['ConstructName'].value_counts().reset_index()
construct_counts.columns = ['ConstructName', 'Count']

# The first ten rows are charted individually, everything after them is pooled
leading_constructs = construct_counts.iloc[:10]
other_constructs_count = construct_counts.iloc[10:]['Count'].sum()

# Single-row frame for the pooled remainder, appended below the ten leaders
other_constructs_df = pd.DataFrame({'ConstructName': ['Other'], 'Count': [other_constructs_count]})
top_constructs = pd.concat([leading_constructs, other_constructs_df], ignore_index=True)

# Pie chart of the ten leaders plus 'Other'
fig = px.pie(
    top_constructs,
    names='ConstructName',
    values='Count',
    title='Proportion of Each Construct in the Dataset (Top 10 Constructs)',
    width=1500,
    height=1500,
)
fig.show()

**What is the Proportion of Each Subject in the Dataset?**

In [None]:
# Questions per subject; value_counts() lists the most frequent subject first
subject_counts = train_df['SubjectName'].value_counts().reset_index()
subject_counts.columns = ['SubjectName', 'Count']

# The first ten rows are charted individually, everything after them is pooled
leading_subjects = subject_counts.iloc[:10]
other_subjects_count = subject_counts.iloc[10:]['Count'].sum()

# Single-row frame for the pooled remainder, appended below the ten leaders
other_subjects_df = pd.DataFrame({'SubjectName': ['Other'], 'Count': [other_subjects_count]})
top_subjects = pd.concat([leading_subjects, other_subjects_df], ignore_index=True)

# Pie chart of the ten leaders plus 'Other'
fig = px.pie(
    top_subjects,
    names='SubjectName',
    values='Count',
    title='Proportion of Each Subject in the Dataset (Top 10 Subjects)',
)
fig.show()


**What is the Proportion of Each Answer Option in the Dataset?**

In [None]:
# How often each option is the correct answer, as a two-column table
answer_option_counts = train_df['CorrectAnswer'].value_counts().reset_index()
answer_option_counts.columns = ['AnswerOption', 'Count']

# Share of each option
fig = px.pie(
    answer_option_counts,
    values='Count',
    names='AnswerOption',
    title='Proportion of Each Answer Option in the Dataset',
)
fig.show()

**How Are Misconceptions Distributed Across Constructs?**

In [None]:
# Preparing data for a pie chart to show distribution of misconceptions per construct:
# long form with one row per (question, answer-option column)
misconception_per_construct = train_df[
    ['ConstructName', 'MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId']
].melt(id_vars='ConstructName', value_name='MisconceptionId')

# count() tallies only the non-missing IDs. size() would count every melted row,
# i.e. always 4 x the number of questions, whether or not a misconception is recorded.
# Sorting puts the constructs with the most misconceptions first so that the
# "top 10" below really is the top 10 and not the first ten names alphabetically.
misconception_counts_per_construct = (
    misconception_per_construct
    .groupby('ConstructName')['MisconceptionId']
    .count()
    .reset_index(name='TotalMisconceptions')
    .sort_values('TotalMisconceptions', ascending=False)
)

# Limiting to top 10 constructs for clarity; the rest is pooled into 'Other'
top_constructs_misconceptions = misconception_counts_per_construct.head(10)
other_constructs_misconceptions_count = misconception_counts_per_construct.iloc[10:]['TotalMisconceptions'].sum()
# Creating a DataFrame for the 'Other' category
other_constructs_misconceptions_df = pd.DataFrame({'ConstructName': ['Other'], 'TotalMisconceptions': [other_constructs_misconceptions_count]})

# Concatenating the top constructs with 'Other'
top_constructs_misconceptions = pd.concat([top_constructs_misconceptions, other_constructs_misconceptions_df], ignore_index=True)

# Plotting the pie chart
fig = px.pie(top_constructs_misconceptions, names='ConstructName', values='TotalMisconceptions', width=1500, height=1500,
             title='Misconceptions Distribution Across Constructs (Top 10 Constructs)')
fig.show()

**How Does the Number of Misconceptions Relate to Each Construct?**

In [None]:
# Preparing data for scatter plot: stack the four misconception columns under one
# common 'MisconceptionId' column, keeping the construct of each row
misconceptions_per_construct = pd.concat([
    train_df[[column, 'ConstructName']].rename(columns={column: 'MisconceptionId'})
    for column in ['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId']
])

# Counting the number of misconceptions per construct.
# count() tallies only the non-missing IDs; size() would count every stacked row
# (4 per question) even when no misconception is recorded.
misconception_counts = (
    misconceptions_per_construct
    .groupby('ConstructName')['MisconceptionId']
    .count()
    .reset_index(name='TotalMisconceptions')
)

# Plotting the scatter plot
fig = px.scatter(misconception_counts, x='ConstructName', y='TotalMisconceptions',
                 title='Number of Misconceptions per Construct',
                 labels={'ConstructName': 'Construct Name', 'TotalMisconceptions': 'Total Misconceptions'},
                 height=1500,
                 text='TotalMisconceptions')

# Adding text labels for better clarity
fig.update_traces(textposition='top center')

fig.show()

**Is There a Relationship Between the Length of Questions and the Number of Misconceptions?**

In [None]:
# Adding a column for the length of the question text (in characters).
# .str.len() is vectorised and returns NaN for a missing text instead of raising
# TypeError the way .apply(len) does on a NaN.
train_df['QuestionLength'] = train_df['QuestionText'].str.len()

# Preparing data for scatter plot: Length of Questions vs. Number of Misconceptions.
# The four misconception columns are stacked under one common 'MisconceptionId' column.
misconceptions_per_question = pd.concat([
    train_df[['QuestionLength', column]].rename(columns={column: 'MisconceptionId'})
    for column in ['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId']
])

# Counting the number of misconceptions per question length.
# count() tallies only the non-missing IDs; size() would count every stacked row
# (4 per question) even when no misconception is recorded.
misconception_counts_per_length = (
    misconceptions_per_question
    .groupby('QuestionLength')['MisconceptionId']
    .count()
    .reset_index(name='TotalMisconceptions')
)

# Plotting the scatter plot
fig = px.scatter(misconception_counts_per_length, x='QuestionLength', y='TotalMisconceptions',
                 title='Relationship Between Question Length and Number of Misconceptions',
                 labels={'QuestionLength': 'Question Length', 'TotalMisconceptions': 'Total Misconceptions'},
                 height=1500,
                 text='TotalMisconceptions')

# Adding text labels for better clarity
fig.update_traces(textposition='top center')

fig.show()

**What is the Relationship Between the Frequency of Each Answer Option and Its Misconceptions?**

In [None]:
# Counting the frequency of each answer option (how often it is the correct answer)
answer_option_frequency = train_df['CorrectAnswer'].value_counts().reset_index()
answer_option_frequency.columns = ['AnswerOption', 'Frequency']

# Counting misconceptions for each answer option: stack the four misconception
# columns under one common 'MisconceptionId' column, keeping the correct answer of each row
misconceptions_per_answer_option = pd.concat([
    train_df[['CorrectAnswer', column]].rename(columns={column: 'MisconceptionId'})
    for column in ['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId']
])

# count() tallies only the non-missing IDs. With size() every question contributes
# exactly 4 rows, which makes TotalMisconceptions equal to 4 x Frequency and the
# scatter plot a straight line by construction.
misconception_counts_per_answer_option = (
    misconceptions_per_answer_option
    .groupby('CorrectAnswer')['MisconceptionId']
    .count()
    .reset_index(name='TotalMisconceptions')
)
# Bring frequency and misconception count of the same answer option side by side
answer_option_summary = answer_option_frequency.merge(misconception_counts_per_answer_option, left_on='AnswerOption', right_on='CorrectAnswer')

# Plotting the scatter plot
fig = px.scatter(answer_option_summary, x='Frequency', y='TotalMisconceptions',
                 title='Relationship Between Answer Option Frequency and Misconceptions',
                 labels={'Frequency': 'Frequency of Answer Option', 'TotalMisconceptions': 'Total Misconceptions'},
                 text='TotalMisconceptions')

# Adding text labels for better clarity
fig.update_traces(textposition='top center')

fig.show()

**Examining the Relationship Between Subject IDs and Misconceptions**

In [None]:
# Stack the four misconception columns under one common 'MisconceptionId' column,
# keeping the subject ID of each row
misconceptions_per_subject_id = pd.concat([
    train_df[['SubjectId', column]].rename(columns={column: 'MisconceptionId'})
    for column in ['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId']
])

# Counting the number of misconceptions per subject ID.
# count() tallies only the non-missing IDs; size() would count every stacked row
# (4 per question) even when no misconception is recorded.
misconception_counts_per_subject_id = (
    misconceptions_per_subject_id
    .groupby('SubjectId')['MisconceptionId']
    .count()
    .reset_index(name='TotalMisconceptions')
)

# Plotting the scatter plot
fig = px.scatter(misconception_counts_per_subject_id, x='SubjectId', y='TotalMisconceptions',
                 title='Number of Misconceptions per Subject ID',
                 labels={'SubjectId': 'Subject ID', 'TotalMisconceptions': 'Total Misconceptions'},
                 height=1500,
                 text='TotalMisconceptions')

# Adding text labels for better clarity
fig.update_traces(textposition='top center')

fig.show()

**Analyzing the Distribution of Correct Answers Across Constructs**

In [None]:
# Number of questions for every (construct, correct answer) combination
construct_answer_groups = train_df.groupby(['ConstructName', 'CorrectAnswer'])
correct_answer_per_construct = construct_answer_groups.size().reset_index(name='Count')

# One marker per combination, coloured by the correct answer and annotated with its count
fig = px.scatter(
    correct_answer_per_construct,
    x='ConstructName',
    y='Count',
    color='CorrectAnswer',
    text='Count',
    height=1500,
    title='Distribution of Correct Answers Across Constructs',
    labels={'ConstructName': 'Construct Name', 'Count': 'Count'},
)

# Place the count above its marker
fig.update_traces(textposition='top center')

fig.show()

**How Are Misconceptions Distributed Across Different Subjects?**

In [None]:
# Stack the four misconception columns under one common 'MisconceptionId' column,
# keeping the subject name of each row
misconceptions_per_subject = pd.concat([
    train_df[['SubjectName', column]].rename(columns={column: 'MisconceptionId'})
    for column in ['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId']
])

# Counting the number of misconceptions per subject.
# count() tallies only the non-missing IDs; size() would count every stacked row
# (4 per question) even when no misconception is recorded.
misconception_counts_per_subject = (
    misconceptions_per_subject
    .groupby('SubjectName')['MisconceptionId']
    .count()
    .reset_index(name='TotalMisconceptions')
)

# Plotting the scatter plot
fig = px.scatter(misconception_counts_per_subject, x='SubjectName', y='TotalMisconceptions',
                 title='Number of Misconceptions per Subject',
                 labels={'SubjectName': 'Subject Name', 'TotalMisconceptions': 'Total Misconceptions'},
                 height=1500,
                 text='TotalMisconceptions')

# Adding text labels for better clarity
fig.update_traces(textposition='top center')

fig.show()

**What is the Proportion of Each Answer Option per Construct?**

In [None]:
# One pie chart is rendered per construct, so charting every construct floods the
# notebook with figures. Only the constructs with the most questions are drawn;
# set MAX_CONSTRUCT_PIES to None to draw all of them.
MAX_CONSTRUCT_PIES = 10

# Preparing data for pie chart to show the proportion of each answer option per construct
answer_option_per_construct = train_df[['ConstructName', 'CorrectAnswer']]
answer_option_counts_per_construct = (
    answer_option_per_construct
    .groupby('ConstructName')['CorrectAnswer']
    .value_counts()
    .reset_index(name='Count')
)

# Constructs ordered from most to fewest questions, truncated to the cap
# (slicing with [:None] keeps every construct)
constructs_to_plot = answer_option_per_construct['ConstructName'].value_counts().index[:MAX_CONSTRUCT_PIES]

# Plotting the pie chart for each selected construct's answer options distribution
for construct in constructs_to_plot:
    construct_data = answer_option_counts_per_construct[answer_option_counts_per_construct['ConstructName'] == construct]
    fig = px.pie(construct_data, names='CorrectAnswer', values='Count',
                 title=f'Proportion of Each Answer Option for Construct: {construct}')
    fig.show()


###### 