# Analysis of the Wikipedia dataset

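Goal: Gather insights into the dataset of Wikipedia sections: text length, most frequent words, word and sentence counts, and how these measures correlate.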

## Load data

In [None]:
import sqlite3

import nltk
import pandas as pd
from environment.env import getDataSourcePath


# Load every row of the `wikipedia_sections` table into memory.
# The connection is closed in a `finally` clause so the database handle is
# released even when the query or the fetch raises.
connection = sqlite3.connect(getDataSourcePath())
try:
    cursor = connection.cursor()

    query = "SELECT * FROM wikipedia_sections"
    cursor.execute(query)

    # All result rows are fetched at once; `data` is what the DataFrame is
    # later built from.
    data = cursor.fetchall()
finally:
    connection.close()


## Analyze dataset

In [None]:
# Column labels applied, in order, to the rows fetched from the database.
column_names = [
    'sectionID',
    'articleID',
    'articleHeadline',
    'rawArticleText',
    'cleanedArticleText',
    'sectionHeadline',
    'isFirstSection',
]
df = pd.DataFrame(data, columns=column_names)

# Print a small sample as a quick sanity check of the loaded data.
print("First 5 lines of dataset")
print(df.head())

In [None]:
# Length (as reported by `len`) of each cleaned text, stored as a new column.
cleaned_text = df['cleanedArticleText']
df['Text_Length'] = cleaned_text.apply(len)

# Descriptive statistics of the new column.
print("\nStatistics to text length:")
length_stats = df['Text_Length'].describe()
print(length_stats)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram (30 bins) of the text lengths with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Text_Length'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of text length')
ax.set_xlabel('Text length')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

# Fetch the NLTK resources needed for tokenization and stop-word filtering.
# Newer NLTK releases load the Punkt models from the 'punkt_tab' resource
# instead of 'punkt', so both are requested to keep the cell working across
# NLTK versions. By default `nltk.download` reports a failed download and
# returns False rather than raising.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

stop_words = set(stopwords.words('english'))


def _content_words(text):
    """Return the lowercased alphabetic tokens of *text* that are not stop words."""
    # Non-alphabetic tokens (numbers, punctuation) are dropped before
    # lowercasing, so each remaining token is lowercased exactly once.
    lowered = (token.lower() for token in word_tokenize(text) if token.isalpha())
    return [token for token in lowered if token not in stop_words]


# Tokenize and filter in a single pass over the column.
df['Tokenized_Text'] = df['cleanedArticleText'].apply(_content_words)

# Flatten the per-text token lists into one list for the frequency count.
all_words = [word for tokens in df['Tokenized_Text'] for word in tokens]
fdist = FreqDist(all_words)

# Plot the 30 most frequent words (plain counts, not cumulative).
plt.figure(figsize=(12, 6))
plt.title('Top 30 words')
fdist.plot(30, cumulative=False)
plt.show()

In [None]:
# Number of tokens in each text's `Tokenized_Text` list.
token_lists = df['Tokenized_Text']
df['Word_Count'] = token_lists.apply(len)

# Histogram (20 bins) of the word counts with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Word_Count'], bins=20, kde=True, ax=ax)
ax.set_title('Distribution of Word Count in Text')
ax.set_xlabel('Number of words')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
def _sentence_count(text):
    """Return how many sentences `nltk.sent_tokenize` finds in *text*."""
    return len(nltk.sent_tokenize(text))


# Sentence count of each cleaned text, stored as a new column.
df['Sentence_Count'] = df['cleanedArticleText'].apply(_sentence_count)

# Histogram (20 bins) of the sentence counts with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Sentence_Count'], bins=20, kde=True, ax=ax)
ax.set_title('Distribution of Sentence Count in Text')
ax.set_xlabel('Number of Sentences')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Pairwise correlation (pandas' default method) between the derived measures.
numeric_columns = ['Text_Length', 'Word_Count', 'Sentence_Count']
corr_matrix = df.loc[:, numeric_columns].corr()

# Annotated heatmap of the correlation matrix, values shown to two decimals.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=.5,
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()