In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import re

In [None]:
# Name of the dataframe column holding the class label analysed in the cells below.
label_modularity = 'NewClass'

In [None]:
# Read the dataset under analysis from its CSV file into a dataframe.
train_filename = 'FinalDataset/polished_dataset_94_20000_limited_texts_down_2000.csv'
train_data = pd.read_csv(filepath_or_buffer=train_filename)

In [None]:
# Dataset overview: number of labels, empty texts, rows and words.
#
# pd.read_csv loads empty CSV fields as NaN by default, so missing texts are
# normalised to '' first. Without this, the `== ''` check never matches a
# missing text and ' '.join(...) raises TypeError on the float NaN values.
texts = train_data['Text'].fillna('').astype(str)

# `num_unique_labels` is kept as an alias of `num_labels` for any cell that uses it.
num_labels = num_unique_labels = train_data[label_modularity].nunique()
print("NUM LABELS")
print(num_labels)

empty_text_count = texts.eq('').sum()
print("EMPTY TEXTS")
print(empty_text_count)

num_rows = train_data.shape[0]
print("NUM TEXTS")
print(num_rows)

# Words are whitespace-separated tokens over the whole corpus.
all_text = ' '.join(texts)
num_words = len(all_text.split())
print("NUM WORDS")
print(num_words)

In [None]:
# Z: the number of texts divided by the average number of words per text.
average_words_per_text = num_words / num_rows
z = num_rows / average_words_per_text
print(z)

In [None]:
# Count texts of each category.
category_counts = train_data[label_modularity].value_counts()
total_count = category_counts.sum()

print("Número de textos por categoría:")
# option_context lifts the row limit only for this print and then restores the
# previous setting, even if printing fails. The former set_option/reset_option
# pair reset the option to the library default, discarding any custom value.
with pd.option_context('display.max_rows', None):
    print(category_counts)

In [None]:
# Bar chart of the number of texts in each category.
fig, ax = plt.subplots(figsize=(14, 6))
category_counts.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Texts per category')
ax.set_xlabel('Category')
ax.set_ylabel('Text amount')
# Vertical tick labels keep long category names readable.
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Find the shortest and the longest text, measured in characters.
# Missing texts (NaN after read_csv) are treated as empty strings, because
# len() on a float NaN raises TypeError. Lengths are computed once and reused.
text_lengths = train_data['Text'].fillna('').astype(str).map(len)
min_length = text_lengths.min()
max_length = text_lengths.max()
print(f"Longitud del texto más corto: {min_length} caracteres")
print(f"Longitud del texto más largo: {max_length} caracteres")

In [None]:
# Find the class with the most text and the class with the least:
# total number of words per category, listed from most to fewest words.

# Missing texts (NaN) become '' so they contribute zero words instead of being
# tokenised as the literal string 'nan' (one spurious word per missing text).
texts_for_counting = train_data['Text'].fillna('').astype(str)

word_counts_by_category = {}

# Walk the label and text columns in lockstep; iterrows() would build a full
# Series object for every row just to read two fields.
for category, text in zip(train_data[label_modularity], texts_for_counting):
    # A word is a maximal run of \w characters in the lower-cased text.
    word_count = len(re.findall(r'\w+', text.lower()))
    word_counts_by_category[category] = word_counts_by_category.get(category, 0) + word_count

# (category, total_words) pairs, sorted by total words in descending order.
category_word_counts = list(word_counts_by_category.items())
category_word_counts_sorted = sorted(category_word_counts, key=lambda x: x[1], reverse=True)

for category, total_words in category_word_counts_sorted:
    print(f'Category: {category}, Total words: {total_words}')

In [None]:
# Grand total of words, accumulated over the per-category totals.
suma = 0
for category_total in word_counts_by_category.values():
    suma += category_total

print("The sum up of all words is:", suma)

In [None]:
# Bar chart of the total number of words in each category,
# in the order of category_word_counts_sorted.

# `words` holds the category names (x axis); `values` holds their word totals (bar heights).
words = [category for category, _ in category_word_counts_sorted]
values = [total_words for _, total_words in category_word_counts_sorted]

fig, ax = plt.subplots(figsize=(14, 5))
ax.bar(words, values, color='skyblue')
ax.set_title('Words per category')
ax.set_xlabel('Category')
ax.set_ylabel('Word amount')
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Average words per text for every class.

# Missing texts (NaN after read_csv) count as zero words; calling .split() on
# a float NaN would raise AttributeError.
train_data['word_count'] = (
    train_data['Text'].fillna('').astype(str).map(lambda text: len(text.split()))
)
# Group by the configured label column instead of a hard-coded 'NewClass', so
# this cell follows `label_modularity` like the other cells do.
mean_word_count_per_class = train_data.groupby(label_modularity)['word_count'].mean()

plt.figure(figsize=(13, 6))
mean_word_count_per_class.plot(kind='bar', color='skyblue')
plt.title('Average words per text for each class')  # fixed typo: 'tex' -> 'text'
plt.xlabel('Class')
plt.ylabel('Average words per text')
plt.xticks(rotation=90)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()