## This program opens the S3 bucket that holds all of the papers, analyzes each `.txt` file in it, and saves the results to `analyzed_data.csv`

In [None]:
import boto3
import csv
import re
from collections import Counter
import string

# S3 client used for every listing and download below. No explicit
# credentials or region are passed here, so they must come from the
# environment's boto3 configuration.
s3 = boto3.client('s3')
# Name of the bucket whose objects are listed and read by the analysis loop.
bucket_name = 'generated-research'

def calculate_zipf_law(word_counts):
    """Return each word frequency relative to the most frequent word.

    Args:
        word_counts: Mapping of word -> occurrence count (for example a
            ``collections.Counter``).

    Returns:
        A list with one ratio ``count / highest_count`` for every count
        except the highest one, ordered from the second most frequent word
        downwards. The list is empty when the mapping holds zero or one entry.
    """
    frequencies = list(word_counts.values())
    frequencies.sort(reverse=True)
    if not frequencies:
        return []

    # The largest count is the reference; every remaining count is scaled by it.
    top_frequency, *lower_frequencies = frequencies
    return [frequency / top_frequency for frequency in lower_frequencies]

# Column names for the output CSV, in the order the values are written per file.
csv_headers = [
    "file_name",                # S3 object key of the analyzed file
    "average_sentence_length",  # mean number of whitespace-separated tokens per sentence
    "average_word_length",      # mean number of characters per word
    "comma_frequency",          # number of ',' characters (a raw count, despite the name)
    "punctuation_frequency",    # number of punctuation characters (a raw count, despite the name)
    "unique_word_count",        # number of distinct words
    "zipf_ratio",               # mean of the ratios returned by calculate_zipf_law
]

def _decode_body(raw_bytes):
    """Decode an object's raw bytes as text, preferring UTF-8.

    Args:
        raw_bytes: The undecoded bytes read from the S3 object body.

    Returns:
        The decoded text. ISO-8859-1 is used when the bytes are not valid
        UTF-8.
    """
    try:
        return raw_bytes.decode('utf-8')
    except UnicodeDecodeError:
        # Re-decode the bytes already in memory instead of downloading the
        # object a second time just to try another encoding.
        return raw_bytes.decode('ISO-8859-1')


def _extract_features(text):
    """Compute the per-file statistics that follow ``file_name`` in the CSV.

    Args:
        text: Full decoded content of one file.

    Returns:
        A list ``[average_sentence_length, average_word_length, comma_count,
        punctuation_count, unique_word_count, average_zipf_ratio]``, matching
        the column order of ``csv_headers`` after the file name. Averages are
        0 when there is nothing to average.
    """
    # Word and sentence level analysis
    words = re.findall(r'\b\w+\b', text)
    # re.split yields empty or whitespace-only fragments (e.g. after the final
    # period or between "..."); they are not sentences, so they must not be
    # counted in the denominator of the average.
    sentences = [part for part in re.split(r'[.!?]', text) if part.strip()]

    average_word_length = sum(len(word) for word in words) / len(words) if words else 0
    average_sentence_length = (
        sum(len(sentence.split()) for sentence in sentences) / len(sentences)
        if sentences
        else 0
    )
    unique_word_count = len(set(words))

    # Comma and punctuation counts
    comma_count = text.count(',')
    punctuation_count = sum(1 for char in text if char in string.punctuation)

    # Zipf's law adherence: average ratio of each word frequency to the top one
    zipf_ratios = calculate_zipf_law(Counter(words))
    average_zipf_ratio = sum(zipf_ratios) / len(zipf_ratios) if zipf_ratios else 0

    return [
        average_sentence_length,
        average_word_length,
        comma_count,
        punctuation_count,
        unique_word_count,
        average_zipf_ratio,
    ]


# Prepare CSV file
with open('analyzed_data.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(csv_headers)  # Write headers

    # A single list_objects_v2 call returns only one page of keys (up to
    # 1,000), so use the paginator to visit every object in the bucket.
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name):
        # A page carries no 'Contents' entry when it lists no objects.
        for obj in page.get('Contents', []):
            key = obj['Key']  # Object key (file name)

            # Only process .txt files
            if not key.endswith('.txt'):
                continue

            raw_bytes = s3.get_object(Bucket=bucket_name, Key=key)['Body'].read()
            file_content = _decode_body(raw_bytes)

            # Write row to CSV
            writer.writerow([key, *_extract_features(file_content)])

print("Analysis complete. Data saved to 'analyzed_data.csv'")
