In [3]:
# Load Stemmed data and non stemmed data

import pandas as pd

stemmed = pd.read_csv("./data/survey_clean_stemmed.csv")
non_stemmed = pd.read_csv("./data/survey_clean.csv")

# Count the number of words in each response in both datasets
stemmed["word_count"] = stemmed["stemmed_message"].str.split().str.len()
non_stemmed["word_count"] = non_stemmed["processed_message"].str.split().str.len()

# Count the number of characters in each response in both datasets
stemmed["char_count"] = stemmed["stemmed_message"].str.len()
non_stemmed["char_count"] = non_stemmed["processed_message"].str.len()

# Sum the number of words and characters in each dataset
stemmed_word_count = stemmed["word_count"].sum()
non_stemmed_word_count = non_stemmed["word_count"].sum()
stemmed_char_count = stemmed["char_count"].sum()
non_stemmed_char_count = non_stemmed["char_count"].sum()

# Tabulate the results
print("Stemmed word count: ", stemmed_word_count)
print("Non stemmed word count: ", non_stemmed_word_count)
print("Stemmed char count: ", stemmed_char_count)
print("Non stemmed char count: ", non_stemmed_char_count)

# Calculate the percentage of words and characters that were removed by stemming
word_reduction = (non_stemmed_word_count - stemmed_word_count) / non_stemmed_word_count
char_reduction = (non_stemmed_char_count - stemmed_char_count) / non_stemmed_char_count

# Tabulate the results
print("Word reduction: ", word_reduction)
print("Char reduction: ", char_reduction)

# Calculate the average word length in each dataset
stemmed["avg_word_length"] = stemmed["char_count"] / stemmed["word_count"]
non_stemmed["avg_word_length"] = non_stemmed["char_count"] / non_stemmed["word_count"]

# Print the average word length in each dataset
print("Stemmed average word length: ", stemmed["avg_word_length"].mean())
print("Non stemmed average word length: ", non_stemmed["avg_word_length"].mean())

# Calculate the percentage of words that are unique in each dataset
stemmed_unique_words = stemmed["stemmed_message"].unique().size
non_stemmed_unique_words = non_stemmed["processed_message"].unique().size
stemmed_unique_word_percentage = stemmed_unique_words / stemmed_word_count
non_stemmed_unique_word_percentage = non_stemmed_unique_words / non_stemmed_word_count

# Tabulate the results
print("Stemmed unique words: ", stemmed_unique_words)
print("Non stemmed unique words: ", non_stemmed_unique_words)
print("Stemmed unique word percentage: ", stemmed_unique_word_percentage)
print("Non stemmed unique word percentage: ", non_stemmed_unique_word_percentage)

from tabulate import tabulate

# Create a DataFrame to hold the results
results = pd.DataFrame(
    {
        "Metric": [
            "Word count",
            "Char count",
            "Word reduction",
            "Char reduction",
            "Average word length",
            "Unique words",
            "Unique word percentage",
        ],
        "Stemmed": [
            stemmed_word_count,
            stemmed_char_count,
            "{:.1%}".format(word_reduction),
            "{:.1%}".format(char_reduction),
            "{:.1f}".format(stemmed["avg_word_length"].mean()),
            stemmed_unique_words,
            "{:.1%}".format(stemmed_unique_word_percentage),
        ],
        "Non-stemmed": [
            non_stemmed_word_count,
            non_stemmed_char_count,
            "{:.1%}".format(word_reduction),
            "{:.1%}".format(char_reduction),
            "{:.1f}".format(non_stemmed["avg_word_length"].mean()),
            non_stemmed_unique_words,
            "{:.1%}".format(non_stemmed_unique_word_percentage),
        ],
    }
)

# Print the DataFrame in a nice table format
print(tabulate(results, headers="keys", tablefmt="github"))

Stemmed word count:  10575
Non stemmed word count:  10574
Stemmed char count:  68326
Non stemmed char count:  78969
Word reduction:  -9.457159069415548e-05
Char reduction:  0.13477440514632325
Stemmed average word length:  6.570828953221212
Non stemmed average word length:  7.586457847538221
Stemmed unique words:  629
Non stemmed unique words:  629
Stemmed unique word percentage:  0.059479905437352244
Non stemmed unique word percentage:  0.0594855305466238
|    | Metric                 | Stemmed   | Non-stemmed   |
|----|------------------------|-----------|---------------|
|  0 | Word count             | 10575     | 10574         |
|  1 | Char count             | 68326     | 78969         |
|  2 | Word reduction         | -0.0%     | -0.0%         |
|  3 | Char reduction         | 13.5%     | 13.5%         |
|  4 | Average word length    | 6.6       | 7.6           |
|  5 | Unique words           | 629       | 629           |
|  6 | Unique word percentage | 5.9%      | 5.9%          |