In [None]:
import os
import numpy as np, gc 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import nltk
import matplotlib.pyplot as plt
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
import textstat 


In [None]:
# this library is needed to assess readability of the essays
!pip install textstat

This notebook serves as a resource for readers at various proficiency levels in Natural Language Processing (NLP). It begins with an introductory section dedicated to general Exploratory Data Analysis (EDA). The second section then presents a content-based analysis of the AES 2.0 competition data, covering topic modeling, sentiment analysis, named entity recognition, readability assessment of the essays, and other common tasks. 

Throughout our investigation, we observed how memory-intensive NLP tasks can be, particularly when running the spaCy pipeline (`nlp(...)`) over a large amount of essay text. Due to memory constraints and the risk of memory leakage, the named entity recognition step was limited to a quarter of the available dataset; the other analyses use all essays.

In [None]:
# Load the competition training data from the Kaggle input directory
TRAIN_CSV_PATH = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv"
train_df = pd.read_csv(TRAIN_CSV_PATH)


In [None]:
# `essays` (the essay text column) is reused by most of the sections below
TEXT_COLUMN = 'full_text'
essays = train_df[TEXT_COLUMN]

**General EDA**

In [None]:
# Report the (rows, columns) shape, then preview the first rows of the frame
train_shape = train_df.shape
print("Train shape", train_shape)
train_df.head()

In [None]:
# Descriptive statistics of the essay scores, printed under a heading
print("Summary Statistics of Scores:")
score_summary = train_df['score'].describe()
print(score_summary)

In [None]:
# Analyzing the length of each essay.
# Store the character count of every essay in a new column. The vectorised
# `.str.len()` accessor replaces `apply(len)`: it gives the same counts for
# strings and yields NaN for a missing essay instead of raising a TypeError.
train_df['essay_length'] = essays.str.len()
# Display the essay_id and essay_length columns
print(train_df[['essay_id', 'essay_length']])

In [None]:
# Longest and shortest essay according to the `essay_length` column
length_column = train_df['essay_length']
max_length, min_length = length_column.max(), length_column.min()

print("Maximum essay length:", max_length)
print("Minimum essay length:", min_length)

In [None]:
# Visualisation of essay lengths.
# A histogram groups the lengths into ranges. Plotting `value_counts()` as a bar
# chart would instead draw one bar per distinct length, ordered by how often the
# length occurs rather than by the length itself.
train_df['essay_length'].plot(kind='hist', bins=50)
plt.title('Analyzing Essay Lengths')
plt.xlabel('Essay Length')
plt.ylabel('No. of Essays')
plt.show()

In [None]:
# Histogram (with KDE overlay) of the essay scores, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_df['score'], kde=True, ax=ax)
ax.set_title("Analyzing Essay Scores")
ax.set_xlabel("Essay Score")
ax.set_ylabel("No. of Essays")
plt.show()

In [None]:
# Single correlation coefficient between the score and the essay length
score_column = train_df['score']
length_column = train_df['essay_length']
corr_matrix = score_column.corr(length_column)
print("Correlation between Essay Length and Score:", corr_matrix)

In [None]:
# Pairwise correlation table of score and essay length
corr_matrix = train_df[['score', 'essay_length']].corr()
# Draw the table as an annotated heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    annot_kws={"size": 12},
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Scatter plot of essay length (x) against essay score (y)
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=train_df, x='essay_length', y='score', ax=ax)
ax.set_title("Essay Length vs. Essay Score Analysis")
ax.set_xlabel("Essay Length")
ax.set_ylabel("Essay Score")
plt.show()

**Essay contents EDA**

In [None]:
# One string holding every essay, separated by single spaces
all_essay_str = ' '.join(essays)
# A smaller combined string built from the first quarter of the essays only;
# it is kept small because running NLP over the combined text is memory-intensive.
quarter_count = len(train_df) // 4
selected_essays = ' '.join(essays.iloc[:quarter_count])


In [None]:
# Download the NLTK tokenizer resources (skipped if already up to date).
# Newer NLTK releases load the `punkt_tab` tables for sentence/word
# tokenization, while older releases use the `punkt` models, so both are requested.
nltk.download('punkt_tab')
nltk.download('punkt')

In [None]:
# Fetch the lexicon used by the VADER sentiment analyzer (no-op when already present)
VADER_RESOURCE = 'vader_lexicon'
nltk.download(VADER_RESOURCE)

**Distribution of most common words**

In [None]:
# Split the combined text on whitespace to obtain the tokens
no_of_words = all_essay_str.split()

# Tally how often every token occurs
words_freq = Counter(no_of_words)

# The 20 most frequent tokens as (token, count) pairs
top_words = words_freq.most_common(20)
top_labels = [token for token, freq in top_words]
top_freqs = [freq for token, freq in top_words]

# Bar chart of the token frequencies
plt.figure(figsize=(10, 6))
plt.bar(top_labels, top_freqs)
plt.title('Top 20 Common Words in Essays')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()


**Analyzing the distribution of sentence lengths in the essays.**

In [None]:
# Split every essay into sentences and collect them in one flat list
sentens = [
    sentence
    for essay in essays
    for sentence in nltk.sent_tokenize(essay)
]

# Length of a sentence = number of tokens produced by the NLTK word tokenizer
senten_lens = list(map(len, map(nltk.word_tokenize, sentens)))

# Histogram of the sentence lengths
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(senten_lens, bins=30, color='skyblue', edgecolor='black')
ax.set_title('Investigating Sentence Lengths in Essays')
ax.set_xlabel('Sentence Length')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()


**Topic Modeling: Analyzing latent topics in the essays using Latent Dirichlet Allocation (LDA) and Non-Negative Matrix Factorization (NMF)**  

In [None]:
# Vocabulary settings shared by both vectorizers: drop terms that appear in more
# than 95% of the essays or in fewer than 2 essays, and remove English stop words.
vectorizer_kwargs = dict(max_df=0.95, min_df=2, stop_words='english')

# TF-IDF document-term matrix (input for NMF)
tfidf_vectorizer = TfidfVectorizer(**vectorizer_kwargs)
tfidf_matrix = tfidf_vectorizer.fit_transform(essays)

# Raw term-count document-term matrix (input for LDA).
# LDA is a generative model over word counts, so it is fitted on counts
# rather than on TF-IDF weights. Both vectorizers use identical settings on the
# same essays, so their columns refer to the same terms and the feature names
# of `tfidf_vectorizer` also label the LDA topics.
count_vectorizer = CountVectorizer(**vectorizer_kwargs)
count_matrix = count_vectorizer.fit_transform(essays)

# LDA with 5 topics; fixed seed for reproducible topics
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda_topics = lda.fit_transform(count_matrix)

# NMF with 5 topics; fixed seed for reproducible topics
nmf = NMF(n_components=5, random_state=42)
nmf_topics = nmf.fit_transform(tfidf_matrix)

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic of a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``; each row holds the term
        weights of one topic.
    feature_names : sequence mapping a column index to its term.
    n_top_words : number of terms to show per topic.

    One ``Topic #<i>: ...`` line is printed per topic, followed by a blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Column indices ordered from the largest weight to the smallest
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = " ".join(feature_names[i] for i in ranked)
        print(f"Topic #{topic_idx}: " + top_terms)
    print()

# Show the 10 top terms per topic, first for LDA and then for NMF
for heading, fitted_model in (("LDA Topics:", lda), ("NMF Topics:", nmf)):
    print(heading)
    print_top_words(fitted_model, tfidf_vectorizer.get_feature_names_out(), 10)


**Sentiment analysis**

In [None]:
# VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Compound polarity score of every essay
sentiment_scores = [sid.polarity_scores(essay)['compound'] for essay in essays]

# Keep the scores alongside the essays
train_df['sentiment_score'] = sentiment_scores

# Classify by the sign of the compound score: > 0 positive, < 0 negative, else neutral
positive_count = sum(1 for score in sentiment_scores if score > 0)
negative_count = sum(1 for score in sentiment_scores if score < 0)
neutral_count = len(sentiment_scores) - (positive_count + negative_count)

print("Sentiment Analysis:")
print("Positive essays:", positive_count)
print("Negative essays:", negative_count)
print("Neutral essays:", neutral_count)

# Bar chart of the three sentiment classes
categories = ['Positive', 'Negative', 'Neutral']
counts = [positive_count, negative_count, neutral_count]

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(categories, counts, color=['green', 'red', 'blue'])
ax.set_title('Sentiment Analysis')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Number of Essays')
plt.show()


**Named Entity Recognition (NER) Analysis**

In [None]:
nlp = spacy.load("en_core_web_sm")

# NER is run on the first quarter of the essays.
ner_essays = essays.iloc[:len(train_df) // 4]

# Stream the essays through spaCy one document at a time instead of parsing a
# single concatenated string: memory use stays bounded by the batch rather than
# by the size of the whole corpus, and entities cannot span two essays.
# NOTE(review): assumes no single essay exceeds spaCy's `nlp.max_length`
# (1,000,000 characters by default) — confirm against the maximum essay length.
NER_BATCH_SIZE = 64  # number of essays buffered per spaCy batch

# Count (entity text, entity label) pairs over all processed essays
named_entities = Counter()
for doc in nlp.pipe(ner_essays, batch_size=NER_BATCH_SIZE):
    named_entities.update((ent.text, ent.label_) for ent in doc.ents)

# Print the most common named entities and their counts
print("Most Common Named Entities:")
for entity, count in named_entities.most_common(10):
    print(f"{entity[0]} ({entity[1]}): {count}")


**Readability analysis**

**Assessing the readability level of the essays (Flesch-Kincaid, Gunning Fog).**

In [None]:
# Flesch-Kincaid grade level and Gunning Fog index for every essay
readability_scores = [
    {
        'Flesch-Kincaid': textstat.flesch_kincaid_grade(essay),
        'Gunning Fog': textstat.gunning_fog(essay),
    }
    for essay in essays
]

# One row per essay, one column per readability metric
readability_df = pd.DataFrame(readability_scores)

# Overlaid histograms of the two metrics
fig, ax = plt.subplots(figsize=(10, 6))
for metric in ('Flesch-Kincaid', 'Gunning Fog'):
    ax.hist(readability_df[metric], bins=20, alpha=0.5, label=metric)
ax.set_title('Readability Scores Distribution')
ax.set_xlabel('Score')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()
# Show the per-essay scores
print(readability_df)
