<a href="https://colab.research.google.com/github/sundaybest3/sundaybest3/blob/main/Corpus/NLTK_FreqList.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🌱 **Making a word frequency list using the {nltk} library**

[sample text](https://read.gov/aesop/001.html)

In [None]:
# Read the text to analyze from the user (input() returns a single line once Enter is pressed)
text = input()

In [None]:
# Example text
# Used only as a fallback: if text was already entered in the input cell above,
# keep it, so that "Run all" does not silently replace the user's input with
# the example. Leave the input blank (or skip that cell) to use the example.
example_text = "This is an example sentence to test the removal of stop words and calculate frequency including 123 and '!'."

try:
    text_is_missing = not text.strip()
except NameError:  # the input cell was not run, so `text` does not exist yet
    text_is_missing = True

if text_is_missing:
    text = example_text

## [1] Pre-processing

In [None]:
# Step 1: Import necessary libraries
# Tip: search online for "NLTK stopwords list" to see which words it contains.
import re  # regular expression
from pathlib import Path

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.probability import FreqDist  # frequency distribution
from nltk.tokenize import word_tokenize  # sent_tokenize is also available for sentences

In [None]:
# Step 2: Download necessary NLTK resources
# Recent NLTK releases load the word_tokenize models from 'punkt_tab', while
# older ones use 'punkt'. Request both so tokenization works on either; a
# resource the installed version does not know is reported and skipped by
# nltk.download() rather than raising.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [None]:
# Step 3: Prepare the stop words
stop_words = set(stopwords.words('english'))

# Add more custom stop words here, e.g. ['said', 'would']
additional_stop_words = []

# Step 5 lower-cases every token before looking it up in this set, so store
# the custom words in lower case too; otherwise an entry such as 'Said' would
# never match anything.
stop_words.update(word.lower() for word in additional_stop_words)

In [None]:
# Step 4: Clean the text
# Replace punctuation with a space instead of deleting it, so neighbouring
# words are not glued together: "don't" becomes "don t" (pieces the stop-word
# filter in Step 5 can still catch) rather than "dont", and "well-known"
# becomes "well known". "_" is listed explicitly because \w matches it.
clean_text = re.sub(r'[^\w\s]|_', ' ', text)  # Punctuation -> space
clean_text = re.sub(r'\d+', '', clean_text)  # Remove numbers
# Collapse the runs of spaces/tabs left behind (line breaks are kept)
clean_text = re.sub(r'[ \t]+', ' ', clean_text).strip()

In [None]:
# Display the cleaned text produced by Step 4
clean_text

text → clean_text (after removing punctuation and numbers; stop words are removed in the next step)

In [None]:
# Step 5: Split the cleaned text into tokens, then keep the lower-cased form
# of every token that is not a stop word
words = word_tokenize(clean_text)
filtered_words = [
    token
    for token in map(str.lower, words)
    if token not in stop_words
]


In [None]:
# Display the raw tokens from word_tokenize (stop words are still included
# here; the filtered, lower-cased tokens are in filtered_words)
words

## [2] Frequency distribution list

In [None]:
# Step 6: Create frequency distribution
# FreqDist counts how many times each token occurs in filtered_words
freq_dist = FreqDist(filtered_words)

Sort the words by frequency, from highest to lowest.

In [None]:
# Step 7: Turn the frequency distribution into a list of (word, frequency)
# tuples ordered from the most frequent word to the least frequent one
sorted_freq_list = freq_dist.most_common()

# Print one "word: frequency" line per entry
for word, frequency in sorted_freq_list:
    print(f'{word}: {frequency}')


## [3] To dataframe and csv file

In [None]:
# Step 8: Convert the sorted frequency list to a DataFrame
# (pandas is imported together with the other libraries in Step 1.)
# Each (word, frequency) tuple becomes one row with the columns below.
df = pd.DataFrame(sorted_freq_list, columns=['Word', 'Frequency'])

In [None]:
# Step 9: Save the DataFrame to a CSV file
# Write to Colab's /content folder when it exists; otherwise fall back to the
# current working directory so the notebook also runs outside Colab.
output_dir = Path('/content') if Path('/content').is_dir() else Path.cwd()
csv_file_path = str(output_dir / 'word_frequencies.csv')

df.to_csv(csv_file_path, index=False)  # index=False leaves out the row-index column
# df.to_csv(csv_file_path)  # alternative: also save the row index (it starts from 0)

print(f"Frequency list saved to CSV file at: {csv_file_path}")

## [4] Check the result

In [None]:
# Preview the first rows of the frequency table
df.head()

---
# The end