In [145]:
import nltk
from nltk import FreqDist, word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer

In [146]:
from nltk.corpus import stopwords

# Fetch every NLTK resource this notebook relies on, so that it also runs on a
# machine with an empty nltk_data directory:
#   - 'stopwords': the word list used to build `stopWords` below
#   - 'punkt':     tokenizer models loaded by word_tokenize / sent_tokenize
# nltk.download() skips packages that are already up to date.
# NOTE(review): recent NLTK releases load 'punkt_tab' instead of 'punkt';
# add it to this tuple if tokenizing raises LookupError - confirm against the
# installed NLTK version.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# A set gives constant-time membership tests in the stopword filter.
stopWords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [147]:
#1. The length (in words)
def getLength(corpus):
    """Return the number of tokens in ``corpus``.

    Every element is counted, so punctuation tokens contribute to the total
    just like words do.
    """
    token_count = len(corpus)
    return token_count

In [148]:
#2. The lexical diversity
def getLexicalDiversity(corpus):
    """Return the ratio of distinct words to total words in ``corpus``.

    Only alphabetic tokens are considered and they are lower-cased first, so
    "The" and "the" count as one word. The same filtered list feeds both the
    numerator and the denominator; punctuation and numeric tokens therefore
    no longer dilute the ratio.

    Args:
        corpus: iterable of string tokens.

    Returns:
        A float between 0 and 1, or 0.0 when the corpus has no alphabetic
        tokens (instead of raising ZeroDivisionError).
    """
    # Case normalization, applied once and reused for both counts.
    words = [w.lower() for w in corpus if w.isalpha()]
    if not words:
        return 0.0
    return len(set(words)) / len(words)

In [149]:
#3. Top 10 most frequent words and their counts
def getTop10(corpus):
    """Return the ten most frequent words in ``corpus`` with their counts.

    Args:
        corpus: iterable of string tokens.

    Returns:
        A list of up to ten ``(word, count)`` tuples, most frequent first.
        Only alphabetic tokens are counted, after lower-casing.
    """
    # Case normalization. (The list is not sorted, and the name must not
    # shadow the builtin `sorted`.)
    normalized_words = [w.lower() for w in corpus if w.isalpha()]

    # Frequency distribution
    fdist = FreqDist(normalized_words)

    return fdist.most_common(10)

In [150]:
# Top 10 most frequent words and their counts (with more than 3 characters)
def getTop10W3(corpus):
    """Return the ten most frequent words longer than three characters.

    Tokens must be purely alphabetic and have more than 3 characters; the
    survivors are lower-cased before counting. The result is a list of
    ``(word, count)`` tuples, most frequent first.
    """
    counts = FreqDist(
        token.lower()
        for token in corpus
        if token.isalpha() and len(token) > 3
    )
    return counts.most_common(10)

In [151]:
# Top 10 most frequent words and their counts, excluding stopwords
# (uses NLTK's built-in stopword list rather than a hand-written one)
def getTop10StopWord(corpus, stop_words=None):
    """Return the ten most frequent non-stopword words with their counts.

    Args:
        corpus: iterable of string tokens.
        stop_words: optional collection of lower-case words to exclude.
            Defaults to the module-level ``stopWords`` set.

    Returns:
        A list of up to ten ``(word, count)`` tuples, most frequent first.
        Only alphabetic tokens are counted, after lower-casing.
    """
    if stop_words is None:
        stop_words = stopWords

    # Case normalization, done once per token and reused for the stopword test.
    lowered = (w.lower() for w in corpus if w.isalpha())
    filtered_words = [w for w in lowered if w not in stop_words]

    # Frequency distribution
    fdist = FreqDist(filtered_words)

    return fdist.most_common(10)

In [152]:
#4. Words that are at least 10 characters long and their counts
def getLongWords(corpus):
    """Count every alphabetic token of ten or more characters.

    Matching tokens are lower-cased before counting. Returns the complete
    list of ``(word, count)`` tuples ordered from most to least frequent.
    """
    counts = FreqDist(
        token.lower()
        for token in corpus
        if len(token) >= 10 and token.isalpha()
    )
    return counts.most_common()

In [153]:
#5. The longest sentence (type the sentence and give the number of words)
def getLongestSentence(corpus):
    """Find the longest sentence in a tokenized corpus.

    The tokens are joined with single spaces and re-split into sentences with
    ``sent_tokenize``. Sentence length is the number of whitespace-separated
    pieces, so punctuation tokens count towards the length and appear
    space-separated in the returned text.

    Args:
        corpus: iterable of string tokens.

    Returns:
        A ``(word_count, sentence)`` tuple for the longest sentence (the first
        one wins a tie), or ``(0, "")`` when there is no text to split
        (instead of ``max()`` raising ValueError).
    """
    text = " ".join(corpus)

    # Sentence-tokenize the whole corpus; skip the tokenizer for blank text.
    sentences = sent_tokenize(text) if text.strip() else []
    if not sentences:
        return 0, ""

    # Split each sentence once and keep the split alongside the sentence.
    best_words, best_sentence = max(
        ((s.split(), s) for s in sentences),
        key=lambda pair: len(pair[0]),
    )
    return len(best_words), best_sentence

In [154]:
#6. A stemmed version of the longest sentence
# Porter stemmer attempt: it did work for certain words.
# The definition below is wrapped in a bare string literal, so it is never
# executed and this cell does not define getStemmed.
"""
def getStemmed(sentence):
    ps = PorterStemmer()  
    # Tokenize
    stemmed_sentence = [ps.stem(w) for w in word_tokenize(sentence.lower())]
    # re-string
    return " ".join(stemmed_sentence)
   """

'\ndef getStemmed(sentence):\n    ps = PorterStemmer()  \n    # Tokenize\n    stemmed_sentence = [ps.stem(w) for w in word_tokenize(sentence.lower())]\n    # re-string\n    return " ".join(stemmed_sentence)\n   '

In [155]:
#6. A stemmed version of the longest sentence
# Snowball stemmer attempt.
# The definition below is wrapped in a bare string literal, so it is never
# executed and this cell does not define getStemmed.
"""
def getStemmed(sentence):
    ss = SnowballStemmer("english") 
    # Tokenize normalization
    stemmed_sentence = [ss.stem(w) for w in word_tokenize(sentence.lower())]
    # re-string
    return " ".join(stemmed_sentence)
    """

'\ndef getStemmed(sentence):\n    ss = SnowballStemmer("english") \n    # Tokenize normalization\n    stemmed_sentence = [ss.stem(w) for w in word_tokenize(sentence.lower())]\n    # re-string\n    return " ".join(stemmed_sentence)\n    '

In [156]:
#6. A stemmed version of the longest sentence
# The Lancaster stemmer still has some problems, but its accuracy was better
# than that of the Porter and Snowball attempts.

def getStemmed(sentence):
    """Return ``sentence`` with every token replaced by its Lancaster stem.

    The sentence is lower-cased, split with ``word_tokenize``, each token is
    stemmed, and the stems are joined back together with single spaces.
    """
    lancaster = LancasterStemmer()
    stems = []
    for token in word_tokenize(sentence.lower()):
        stems.append(lancaster.stem(token))
    return " ".join(stems)


In [164]:
# Read the input text and tokenize it once; every metric below works on `tokens`.
with open('tester3.txt', 'r', encoding='utf-8') as file:
    content = file.read()

tokens = word_tokenize(content)

# Get functions
length = getLength(tokens)
lexical_diversity = getLexicalDiversity(tokens)
top_10 = getTop10(tokens)
top_10_w3 = getTop10W3(tokens)
top_10_stopwords = getTop10StopWord(tokens)
long_words = getLongWords(tokens)
longest_sentence_length, longest_sentence = getLongestSentence(tokens)
stemmed_sentence = getStemmed(longest_sentence)

# Sections that share the same "heading + indented word: count lines" layout.
# Section 4 (words with more than 3 characters) was computed but never
# written before, which left a gap between sections 3 and 5 in the report.
frequency_sections = [
    ("3. Top 10 Most Frequent Words:\n", top_10),
    ("4. Top 10 Most Frequent Words (More Than 3 Characters):\n", top_10_w3),
    ("5. Top 10 Most Frequent Words Excluding Stopwords:\n", top_10_stopwords),
    ("6. Words that are at Least 10 Characters Long:\n", long_words),
]

# Write results to txt
with open('output3.txt', 'w', encoding='utf-8') as output_file:
    output_file.write(f"1. Length (in words): {length}\n")
    output_file.write(f"2. Lexical Diversity: {lexical_diversity}\n")

    for heading, word_counts in frequency_sections:
        output_file.write(heading)
        for word, count in word_counts:
            output_file.write(f"   {word}: {count}\n")

    output_file.write(f"7. Longest Sentence Length: {longest_sentence_length}\n")
    output_file.write(f"   Longest Sentence: \"{longest_sentence}\"\n")
    output_file.write("8. Stemmed Version of the Longest Sentence:\n")
    output_file.write(f"   \"{stemmed_sentence}\"\n")

print("done")

done
