In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# files: long-format listing (ls -l) of the input data directory that the
# read_csv calls in this notebook load from
!ls -l '../input/eedi-mining-misconceptions-in-mathematics/'

In [None]:
# pandas display configuration: setting a limit to None disables truncation
pd.options.display.max_columns = None   # render every column
pd.options.display.max_rows = None      # render every row
pd.options.display.max_colwidth = None  # render the full text of each cell

# colour constants shared by the plots in this notebook
default_color_1, default_color_2, default_color_3 = 'darkblue', 'darkgreen', 'darkred'

In [None]:
# read the train, test and misconception-mapping CSV files, which all live in
# the same input directory
input_dir = '../input/eedi-mining-misconceptions-in-mathematics/'
df_train = pd.read_csv(input_dir + 'train.csv')
df_test = pd.read_csv(input_dir + 'test.csv')
df_map = pd.read_csv(input_dir + 'misconception_mapping.csv')

# Basic EDA

In [None]:
# show the first five rows of the training frame
df_train.head(n=5)

In [None]:
# structure: print the DataFrame.info() summary of the training frame
# (column names, non-null counts, dtypes, memory usage)
df_train.info()

In [None]:
# the 20 most frequent ConstructName values (value_counts sorts descending)
df_train['ConstructName'].value_counts().head(20)

In [None]:
# number of training rows per SubjectName, most frequent first
df_train['SubjectName'].value_counts()

In [None]:
# display every training row whose SubjectName equals the chosen subject
my_subject = 'Properties of Polygons'
df_train.loc[df_train['SubjectName'].eq(my_subject)]

# Distribution of correct answers

In [None]:
# count the rows per CorrectAnswer value, ordered by the answer value itself
tab = df_train['CorrectAnswer'].value_counts().sort_index()
print(tab)

# draw the same counts as a bar chart, working on the returned Axes object
ax = tab.plot.bar(color=default_color_1)
ax.set_title('CorrectAnswer')
ax.grid()
plt.show()

# Answer distributions by subject

In [None]:
# contingency table: one row per SubjectName, one column per CorrectAnswer,
# each cell holding the number of training rows with that combination
tab_answer_by_subject = pd.crosstab(index=df_train['SubjectName'],
                                    columns=df_train['CorrectAnswer'])
tab_answer_by_subject

# Most frequent misconceptions

In [None]:
# Build `counter`: how often each misconception id occurs across the four
# Misconception{A,B,C,D}Id columns of df_train, joined with the matching
# rows of df_map.

# stack the four per-answer misconception id columns into one long Series
miscon_all = pd.concat([df_train[f'Misconception{answer}Id'] for answer in 'ABCD'])
# remove missing values
miscon_all = miscon_all.dropna()
# convert to object dtype
miscon_all = miscon_all.astype(object)
# count frequencies; the counts Series is named explicitly rather than renaming
# a 'count' column afterwards, because that default name only exists in
# pandas >= 2.0 (older versions produce a column named 0, so the rename was a
# silent no-op and no 'Frequency' column was created)
counter = miscon_all.value_counts().rename('Frequency').to_frame()
# expose the ids (currently the index) as a regular column to merge on
counter['MisconceptionId'] = counter.index
# add descriptions from mapping table (default inner join on MisconceptionId)
counter = pd.merge(left=counter, right=df_map, on='MisconceptionId')

In [None]:
# display the first 20 rows of the misconception frequency table
counter.head(20)