# File Handling
# Day 19: 30DaysOfPython Challenge

### Exercises: Level 1

In [21]:
# Write a function which count number of lines and number of words in a text:
# a) Read obama_speech.txt file and count number of lines and words.

def line_count_word_count(textname):
    line_count = 0
    word_count = 0
    with open(textname, 'r') as txt:
        for line in txt:
            line_count += 1
            word_count += len(line.split())
        print(f"Number of lines: {line_count}")
        print(f"Number of words: {word_count}")

line_count_word_count('obama_speech.txt')

Number of lines: 66
Number of words: 2400


In [22]:
# b) Read michelle_obama_speech.txt file and count number of lines and words

line_count_word_count('michelle_obama_speech.txt')

Number of lines: 83
Number of words: 2204


In [23]:
# c) Read donald_speech.txt file and count number of lines and words

line_count_word_count('donald_speech.txt')

Number of lines: 48
Number of words: 1259


In [24]:
# d) Read melina_trump_speech.txt file and count number of lines and words

line_count_word_count('melina_trump_speech.txt')

Number of lines: 33
Number of words: 1375


In [18]:
import json

def most_populated_countries(filename, n):
    with open(filename, 'r', encoding='utf-8') as file:
        countries_data = json.load(file)

    sorted_countries = sorted(countries_data, key=lambda x: x['population'], reverse=True)

    # Take the top n countries
    top_countries = sorted_countries[:n]

    # Create a list of dictionaries with 'country' and 'population'
    result = [{'country': country['name'], 'population': country['population']} for country in top_countries]

    return result

filename = './countries_data.json'

result = most_populated_countries(filename, 10)
print(result)

[{'country': 'China', 'population': 1377422166}, {'country': 'India', 'population': 1295210000}, {'country': 'United States of America', 'population': 323947000}, {'country': 'Indonesia', 'population': 258705000}, {'country': 'Brazil', 'population': 206135893}, {'country': 'Pakistan', 'population': 194125062}, {'country': 'Nigeria', 'population': 186988000}, {'country': 'Bangladesh', 'population': 161006790}, {'country': 'Russian Federation', 'population': 146599183}, {'country': 'Japan', 'population': 126960000}]


### Exercises: Level 2

In [25]:
"""Find the most common words in the English language. Call the name of your function find_most_common_words, 
it will take two parameters - a string or a file and a positive integer, indicating the number of words.
Your function will return an array of tuples in descending order.
"""
import re

def find_most_common_words(file_path, num_words):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
        
    words = re.findall(r'\b\w+\b', text.lower())

    word_freq = {}
    for word in words:
        word_freq[word] = word_freq.get(word, 0) + 1

    sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)

    most_common = sorted_words[:num_words]

    return most_common
result = find_most_common_words('sample.txt', 10)
print(result)

[('brown', 5), ('the', 4), ('fox', 4), ('dog', 4), ('a', 4), ('quick', 3), ('and', 3), ('jumps', 2), ('over', 2), ('lazy', 2)]


In [27]:
""""
Use the function, find_most_frequent_words to find: 
a) The ten most frequent words used in Obama's speech
"""

obama_result = find_most_common_words('obama_speech.txt', 10)
print(obama_result)

[('the', 129), ('and', 113), ('of', 81), ('to', 70), ('our', 67), ('we', 62), ('that', 50), ('a', 48), ('is', 36), ('in', 25)]


In [28]:
# b) The ten most frequent words used in Michelle's speech
michelle_result = find_most_common_words('michelle_obama_speech.txt', 10)
print(michelle_result)

[('and', 96), ('the', 85), ('to', 84), ('that', 50), ('of', 46), ('a', 41), ('he', 37), ('in', 36), ('my', 28), ('i', 28)]


In [31]:
# c) The ten most frequent words used in Trump's speech 
donald_result = find_most_common_words('donald_speech.txt', 10)
print(donald_result)

[('the', 65), ('and', 59), ('we', 44), ('will', 40), ('of', 38), ('to', 32), ('our', 30), ('is', 20), ('america', 17), ('for', 13)]


In [30]:
# d) The ten most frequent words used in Melina's speech
melina_result = find_most_common_words('melina_trump_speech.txt', 10)
print(melina_result)

[('and', 77), ('to', 55), ('the', 52), ('is', 29), ('i', 28), ('for', 27), ('of', 25), ('that', 24), ('a', 22), ('you', 21)]


In [32]:
"""
Write a python application that checks similarity between two texts.
It takes a file or a string as a parameter and it will evaluate the similarity of the two texts.
For instance check the similarity between the transcripts of Michelle's and Melina's speech.
You may need a couple of functions, function to clean the text(clean_text),
function to remove support words(remove_support_words) and finally to check the similarity(check_text_similarity).
List of stop words are in the data directory
"""

import re
from collections import Counter
import math

def clean_text(text):
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text).lower()
    return cleaned_text

def remove_stop_words(text, stop_words):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

def calculate_tf(text):
    # Calculate term frequency (TF) for each word in the text
    words = text.split()
    word_counts = Counter(words)
    total_words = len(words)
    tf = {word: count / total_words for word, count in word_counts.items()}
    return tf

def calculate_idf(corpus):
    # Calculate inverse document frequency (IDF) for each word in the corpus
    document_count = len(corpus)
    idf = {}
    for document in corpus:
        words = set(document.split())
        for word in words:
            idf[word] = idf.get(word, 0) + 1

    idf = {word: math.log(document_count / (count + 1)) for word, count in idf.items()}
    return idf

def calculate_cosine_similarity(tf1, tf2, idf):
    # Calculate cosine similarity between two TF-IDF vectors
    intersection = set(tf1.keys()) & set(tf2.keys())
    numerator = sum(tf1[word] * tf2[word] * idf[word] ** 2 for word in intersection)
    
    magnitude1 = math.sqrt(sum((tf1[word] * idf[word]) ** 2 for word in tf1))
    magnitude2 = math.sqrt(sum((tf2[word] * idf[word]) ** 2 for word in tf2))
    
    denominator = magnitude1 * magnitude2
    if denominator == 0:
        return 0  
    
    similarity = numerator / denominator
    return similarity

from stop_words import stop_words

with open('michelle_obama_speech.txt', 'r', encoding='utf-8') as file:
    michelle_speech = file.read()

with open('melina_trump_speech.txt', 'r', encoding='utf-8') as file:
    melina_speech = file.read()

cleaned_michelle_speech = remove_stop_words(clean_text(michelle_speech), stop_words)
cleaned_melina_speech = remove_stop_words(clean_text(melina_speech), stop_words)

# Calculate TF-IDF vectors
tf_michelle = calculate_tf(cleaned_michelle_speech)
tf_melina = calculate_tf(cleaned_melina_speech)

idf_corpus = [cleaned_michelle_speech, cleaned_melina_speech]
idf = calculate_idf(idf_corpus)

# Calculate cosine similarity
similarity = calculate_cosine_similarity(tf_michelle, tf_melina, idf)
print(f"Similarity between the two speeches: {similarity:.2%}")

Similarity between the two speeches: 72.62%


In [33]:
# Find the 10 most repeated words in the romeo_and_juliet.txt
import re
from collections import Counter

# Read the content of the text file
with open('romeo_and_juliet.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Clean the text by removing non-alphanumeric characters and converting to lowercase
cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text).lower()

# Tokenize the text into words
words = cleaned_text.split()

# Count the occurrences of each word
word_counts = Counter(words)

# Get the 10 most common words
top_words = word_counts.most_common(10)

# Print the result
print("Top 10 most repeated words:")
for word, count in top_words:
    print(f"{word}: {count}")

Top 10 most repeated words:
the: 866
and: 793
to: 625
i: 585
of: 535
a: 528
in: 377
is: 375
that: 363
you: 362
