# Aim of the competition

* Predict the affinity between misconceptions and incorrect answers (distractors) in multiple-choice questions

* Before starting, please consult the ML canvas to understand the direction of the project

# Install Library Gensim

In [None]:
%%time

!pip install gensim

# Load Files

In [None]:
%%time

import pandas as pd
import warnings

# NOTE: this silences every warning for the rest of the notebook.
warnings.filterwarnings('ignore')

# Single place to edit if the competition data is mounted somewhere else.
DATA_DIR = '/kaggle/input/eedi-mining-misconceptions-in-mathematics'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
misconception = pd.read_csv(f'{DATA_DIR}/misconception_mapping.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Define Columns

In [None]:
%%time

# Columns kept from each dataset: the question id, the question text and the four answer options
train_columns = ['QuestionId', 'QuestionText', 'AnswerAText', 'AnswerBText', 'AnswerCText', 'AnswerDText']
test_columns = ['QuestionId', 'QuestionText', 'AnswerAText', 'AnswerBText', 'AnswerCText', 'AnswerDText']

# Keep only those columns. The explicit .copy() makes the results independent frames,
# so adding columns to them in later cells does not trigger SettingWithCopyWarning.
train_filtered = train[train_columns].copy()
test_filtered = test[test_columns].copy()

# Only the last bare expression of a cell is rendered, so display both frames explicitly.
display(train_filtered.head(), test_filtered.head())

# Check Missing Values

In [None]:
%%time

# Number of missing entries in each column of the filtered training data
missing_values = train_filtered.isna().sum()

missing_values

# Clean Text While Preserving Mathematical Formulas

In [None]:
%%time

import re

# Function to clean text but keep mathematical symbols and formatting
def clean_text(text):
    """Strip decorative symbols from ``text`` while keeping mathematical notation.

    Characters that are kept: ASCII letters, digits, whitespace, the brackets
    ``( ) [ ] { }``, the operators ``+ - * / = < > ^ %``, the decimal point,
    the underscore and the backslash (used by LaTeX commands). Every other
    character (for example ``? ! : ; , " '``) is removed.

    Parameters
    ----------
    text : str, None or float
        Raw text. ``None`` and ``NaN`` (a missing cell) are treated as empty
        text; any other non-string value is converted with ``str`` first.

    Returns
    -------
    str
        The cleaned text.
    """
    # Missing cells arrive as None or float NaN (NaN is the only float unequal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Keeping ".", "=", "^", "{}", "<", ">", "%" and "_" prevents formulas from being
    # mangled (e.g. "0.5" -> "05" or "\frac{1}{2}" -> "\frac12").
    cleaned_text = re.sub(r'[^A-Za-z0-9\s()\[\]{}+\-*/\\=<>^%._]', '', text)
    return cleaned_text

# Add a "<column>_Clean" companion for the question text and for each answer option
for source_col in ['QuestionText', 'AnswerAText', 'AnswerBText', 'AnswerCText', 'AnswerDText']:
    train_filtered[f'{source_col}_Clean'] = train_filtered[source_col].apply(clean_text)

clean_preview_columns = [
    'QuestionText',
    'QuestionText_Clean',
    'AnswerAText_Clean',
    'AnswerBText_Clean',
    'AnswerCText_Clean',
    'AnswerDText_Clean',
]
train_filtered[clean_preview_columns].head()

# Preprocessing Text

In [None]:
%%time

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from string import punctuation
import string

# Instantiate the lemmatizer once at module level; preprocess_text looks it up by this name
lemmatizer = WordNetLemmatizer()

In [None]:
%%time

# Download necessary resources from NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
%%time

# Lookup sets used when filtering tokens: English stop words and single punctuation characters
stop_words = {*stopwords.words('english')}
punctuation = {*string.punctuation}

In [None]:
%%time

# Combined function to clean and preprocess the text
def preprocess_text(text):
    """Lowercase, tokenize, filter and lemmatize ``text``.

    Steps: lowercase the text, collapse runs of whitespace, tokenize with
    ``word_tokenize``, drop tokens found in ``stop_words`` or ``punctuation``,
    lemmatize the remaining tokens and join them with single spaces.

    Parameters
    ----------
    text : str, None or float
        Raw text. ``None`` and ``NaN`` (a missing cell) yield an empty string
        instead of raising; any other non-string value is converted with ``str``.

    Returns
    -------
    str
        The space-joined, lemmatized tokens.
    """
    # Missing cells arrive as None or float NaN (NaN is the only float unequal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase
    text = str(text).lower()

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize the text
    words = word_tokenize(text)

    # Remove punctuation and stop words, and lemmatize the words
    cleaned_words = [
        lemmatizer.lemmatize(word)
        for word in words
        if word not in stop_words and word not in punctuation
    ]

    # Join the cleaned words back into a single string
    return ' '.join(cleaned_words)

# Add a "cleaned_<column>" companion for the question text and for each answer option
for source_col in ['QuestionText', 'AnswerAText', 'AnswerBText', 'AnswerCText', 'AnswerDText']:
    train_filtered['cleaned_' + source_col] = train_filtered[source_col].apply(preprocess_text)

preprocessed_preview_columns = [
    'QuestionText',
    'cleaned_QuestionText',
    'cleaned_AnswerAText',
    'cleaned_AnswerBText',
    'cleaned_AnswerCText',
    'cleaned_AnswerDText',
]
train_filtered[preprocessed_preview_columns].head()

# Encode Categorical Columns

In [None]:
%%time

# Using label encoding for the categorical columns 
from sklearn.preprocessing import LabelEncoder

# A LabelEncoder instance is created here; the encoding below uses one-hot dummies instead
label_encoder = LabelEncoder()


def _one_hot(frame):
    """Return the object-dtype columns of ``frame`` and its one-hot encoded version."""
    object_columns = frame.select_dtypes(include=['object']).columns
    encoded = pd.get_dummies(frame, columns=object_columns, drop_first=True)
    return object_columns, encoded


# One-hot encode every object-dtype column of the train and test frames
categorical_columns_train, train_encoded = _one_hot(train_filtered)
categorical_columns_test, test_encoded = _one_hot(test_filtered)

# Give the test frame the train frame's columns; columns it lacks are filled with 0
train_encoded, test_encoded = train_encoded.align(
    test_encoded,
    join='left',
    axis=1,
    fill_value=0,
)

(train_encoded.head(), test_encoded.head())

# Visualize WordCloud

In [None]:
%%time

from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter

# Build one long string from the raw 'QuestionText' column (no cleaning is applied here)
corpus = ' '.join(train_filtered['QuestionText'].astype(str))

# Render the word cloud from that string
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(corpus)

# Show it without axes
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud for Question Text')
plt.show()

# Count Vocab

In [None]:
%%time

# Vocabulary size: number of distinct whitespace-separated tokens in the corpus string
words = corpus.split()
vocabulary_count = len({*words})

vocabulary_count

# Visualize Most Common Words

In [None]:
%%time

from collections import Counter
import pandas as pd

# Split the corpus into individual words
words = corpus.split()

# Count the frequency of each word
word_counts = Counter(words)

# Create a DataFrame with the 20 most common words
common_words_df = pd.DataFrame(word_counts.most_common(20), columns=['Word', 'Frequency'])

# Create the axes first and hand them to pandas: DataFrame.plot() without `ax` opens its
# own figure, which would leave an empty 10x6 figure behind and ignore the requested size.
fig, ax = plt.subplots(figsize=(10, 6))
common_words_df.plot(kind='bar', x='Word', y='Frequency', legend=False, ax=ax)
ax.set_title('Most Common Words in Question Text')
ax.set_ylabel('Frequency')
ax.set_xlabel('Words')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

# Perform LDA Topic Modelling

In [None]:
%%time

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Turn the question texts into a document-term count matrix
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
question_texts = train_filtered['QuestionText'].astype(str)
X = vectorizer.fit_transform(question_texts)

# Fit a 5-topic LDA model on the counts
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(X)

# Describe each topic by its highest-weighted vocabulary terms
n_top_words = 10
feature_names = vectorizer.get_feature_names_out()

topics = []
for topic_idx, topic in enumerate(lda.components_):
    # Indices sorted by descending weight, truncated to the top n_top_words
    top_features_ind = topic.argsort()[::-1][:n_top_words]
    top_features = [feature_names[i] for i in top_features_ind]
    topics.append(f"Topic {topic_idx+1}: {' '.join(top_features)}")

topics

# Create BoW & TF-IDF

In [None]:
%%time

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# From here on `corpus` is a list with one string per question
corpus = list(train_filtered['QuestionText'].astype(str))

# Bag of Words: raw term counts, vocabulary capped at 1000 terms
vectorizer_bow = CountVectorizer(max_features=1000)
X_bow = vectorizer_bow.fit_transform(corpus)

# TF-IDF weights with the same vocabulary cap
vectorizer_tfidf = TfidfVectorizer(max_features=1000)
X_tfidf = vectorizer_tfidf.fit_transform(corpus)

# Shapes of the two resulting matrices
(X_bow.shape, X_tfidf.shape)

# Tokenize Text & Create Word Vectors

In [None]:
%%time

from gensim.models import Word2Vec

# Tokenize the text on whitespace (case is preserved, so 'What' and 'what' are distinct tokens)
tokenized_corpus = [question.split() for question in corpus]

# Train a Word2Vec model on the tokenized text.
# seed=42 matches the random_state used by the other stochastic cells of this notebook.
# NOTE(review): per the gensim documentation a fully deterministic run additionally needs
# workers=1 and a fixed PYTHONHASHSEED; workers=4 is kept here for training speed.
word2vec_model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    seed=42,
)

# Get the vector for a word
word_vector = word2vec_model.wv['What']

word_vector

In [None]:
%%time

terms = ['triangle', 'rectangle', 'function', 'equation', 'graph']

# Collect the embedding of every listed term that is present in the model's vocabulary
word_vectors = {}
for term in terms:
    if term in word2vec_model.wv:
        word_vectors[term] = word2vec_model.wv[term]

word_vectors

In [None]:
%%time

# Imported in this cell because it runs before the cells further down that import numpy.
import numpy as np

# Convert the entire text into vectors (average word vectors for each document)
document_vectors = []
for tokens in tokenized_corpus:
    known_vectors = [word2vec_model.wv[word] for word in tokens if word in word2vec_model.wv]
    if known_vectors:
        # Average over the vectors actually found, not over len(tokens), so that
        # out-of-vocabulary tokens cannot shrink the mean.
        vector = np.mean(known_vectors, axis=0)
    else:
        # An empty question (or one without any known token) gets a zero vector instead
        # of raising ZeroDivisionError.
        vector = np.zeros(word2vec_model.wv.vector_size, dtype=np.float32)
    document_vectors.append(vector)

# K-Means Clustering

In [None]:
%%time

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np

# Normalize the data using StandardScaler, setting with_mean=False for sparse data.
# X is the count matrix built from 'QuestionText' in the LDA cell.
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

# Apply K-Means with an arbitrary number of clusters (e.g., 5 clusters)
kmeans = KMeans(n_clusters=5, random_state=42)
train_filtered['Cluster'] = kmeans.fit_predict(X_scaled)

# Show the question text as well: it is the text the clusters were actually computed from
train_filtered[['QuestionText', 'AnswerAText', 'AnswerBText', 'AnswerCText', 'AnswerDText', 'Cluster']].head()

# Visualize K-Means Clustering of Question Text

In [None]:
%%time

from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt

# Reduce the dimensionality of the data to 2D for visualization using TruncatedSVD.
# random_state is fixed because the default solver is randomized; without it the
# projection (and therefore the plot) changes between runs.
svd = TruncatedSVD(n_components=2, random_state=42)
X_svd = svd.fit_transform(X_scaled)

# Plot the clusters. X_scaled is derived from the question-text count matrix, so the
# title names question texts.
plt.scatter(X_svd[:, 0], X_svd[:, 1], c=train_filtered['Cluster'], cmap='viridis', marker='o')
plt.title('K-Means Clustering of Question Texts')
plt.xlabel('SVD Component 1')
plt.ylabel('SVD Component 2')
plt.show()

# The most similar questions

In [None]:
%%time

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Create a TF-IDF vectorizer to convert 'QuestionText' into numerical vectors
vectorizer = TfidfVectorizer()

# Fit and transform the 'QuestionText' column to get the TF-IDF matrix
# (astype(str) as in the other cells, so a missing value cannot break the vectorizer)
question_vectors = vectorizer.fit_transform(train_filtered['QuestionText'].astype(str))

# Use the first question as the query
query_index = 0
query_vector = question_vectors[query_index]

# Compute cosine similarities between the query and every question (itself included)
cosine_similarities = cosine_similarity(query_vector, question_vectors)

# Rank by descending similarity and drop the query by its index. Slicing off the first
# rank instead would be wrong when a duplicate question ties with the query at 1.0.
ranked_indices = cosine_similarities[0].argsort()[::-1]
most_similar_indices = ranked_indices[ranked_indices != query_index][:5]  # Top 5 most similar

# The most similar questions
most_similar_questions = train_filtered.iloc[most_similar_indices][['QuestionText']]
most_similar_questions

# The most similar answers

In [None]:
%%time

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Create a TF-IDF vectorizer to convert text into numerical vectors.
# The default token_pattern only keeps tokens of two or more characters, which would turn
# short answers such as "5" or "x" into all-zero vectors; this pattern keeps them.
vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')

# Fit and transform the 'AnswerAText' column to get the TF-IDF matrix
# (astype(str) so a missing value cannot break the vectorizer)
answer_vectors = vectorizer.fit_transform(train_filtered['AnswerAText'].astype(str))

# Use the first answer as the query
query_index = 0
query_vector = answer_vectors[query_index]

# Compute cosine similarities between the query and every answer (itself included)
cosine_similarities = cosine_similarity(query_vector, answer_vectors)

# Rank by descending similarity and drop the query by its index. Slicing off the first
# rank instead would be wrong when an identical answer ties with the query at 1.0.
ranked_indices = cosine_similarities[0].argsort()[::-1]
most_similar_indices = ranked_indices[ranked_indices != query_index][:5]  # Top 5 most similar

# The most similar answers
most_similar_answers = train_filtered.iloc[most_similar_indices][['AnswerAText', 'AnswerBText', 'AnswerCText', 'AnswerDText']]
most_similar_answers