<a href="https://colab.research.google.com/github/sundaybest3/s24Corpus-final/blob/main/Corpus/NLTK_spokenwritten.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLTK analysis example: spoken vs. written comparison (0605-updated)

In [None]:
!pip install nltk

# POS tagging with NLTK: finding 'be + p.p.' and 'be + p.p. + by + agent' patterns




In [None]:
import pandas as pd
import requests
from io import StringIO
import nltk
from nltk.tokenize import word_tokenize
import re
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency

# Download necessary NLTK resources.
# Recent NLTK releases look for renamed data packages ('punkt_tab',
# 'averaged_perceptron_tagger_eng'), while older releases use the original
# names. NLTK is installed unpinned, so request both sets; a name that the
# installed version does not need is simply downloaded and left unused.
for resource in (
    'punkt',                           # Tokenizer model (older NLTK)
    'punkt_tab',                       # Tokenizer model (newer NLTK)
    'averaged_perceptron_tagger',      # POS tagger model (older NLTK)
    'averaged_perceptron_tagger_eng',  # POS tagger model (newer NLTK)
):
    nltk.download(resource)

In [None]:
# Function to read and process data from a URL
def read_and_process_data(url, text_column='text', timeout=30):
    """Download a CSV file and return the alphabetic word tokens of one column.

    Args:
        url: Address of the CSV file to download.
        text_column: Name of the column that holds the text (default 'text').
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list[str]: Tokens produced by ``word_tokenize`` that consist only of
        letters. Punctuation, numbers and clitics such as "n't" are dropped.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If ``text_column`` is not a column of the CSV.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as CSV.
    response.raise_for_status()
    data = pd.read_csv(StringIO(response.text))

    # Drop missing cells first: astype(str) would turn NaN into the literal
    # string 'nan', which is alphabetic and would be counted as a word.
    texts = data[text_column].dropna().astype(str)
    combined_text = ' '.join(texts)

    tokens = word_tokenize(combined_text)
    return [word for word in tokens if word.isalpha()]  # Remove punctuation

# Where each corpus lives (swap in your own CSV URLs if needed)
spoken_url = (
    'https://github.com/sundaybest3/s24Corpus-final/raw/main/Corpus/TEDdata/Cleantext0605.csv'
)  # spoken corpus
written_url = (
    'https://raw.githubusercontent.com/sundaybest3/s24Corpus-final/main/now_final.csv'
)  # written corpus

# Download and tokenize the spoken corpus first, then the written one
spoken_tokens = read_and_process_data(url=spoken_url)
written_tokens = read_and_process_data(url=written_url)


In [None]:
# Inflected forms of the auxiliary 'be', compared case-insensitively.
# Includes the bare form 'be' so that "will be done" / "to be seen" count.
_BE_FORMS = frozenset({'be', 'am', 'is', 'are', 'was', 'were', 'been', 'being'})
# Penn Treebank tags accepted as the head noun of a 'by'-agent.
_AGENT_NOUN_TAGS = frozenset({'NN', 'NNS', 'NNP', 'NNPS'})
# Tags allowed between 'by' and the head noun: determiners, possessive
# pronouns, adjectives and numerals ("by the local two authors").
_AGENT_MODIFIER_TAGS = frozenset({'DT', 'PDT', 'PRP$', 'JJ', 'JJR', 'JJS', 'CD'})
# Upper bound on how many modifiers may precede the agent noun.
_MAX_AGENT_MODIFIERS = 3


def _tag(tokens, tagged_tokens):
    """Return ``tagged_tokens`` as a list if given, else POS-tag ``tokens``."""
    if tagged_tokens is not None:
        return list(tagged_tokens)
    tokens = list(tokens)
    if not tokens:
        return []
    return nltk.pos_tag(tokens)


def _is_be_pp(tagged, i):
    """True if position ``i`` is a form of 'be' followed by a VBN-tagged word."""
    return tagged[i][0].lower() in _BE_FORMS and tagged[i + 1][1] == 'VBN'


def _agent_head_index(tagged, start):
    """Index of the agent noun at/after ``start``, skipping modifiers; else None."""
    limit = min(len(tagged), start + _MAX_AGENT_MODIFIERS + 1)
    for j in range(start, limit):
        tag = tagged[j][1]
        if tag in _AGENT_NOUN_TAGS:
            return j
        if tag not in _AGENT_MODIFIER_TAGS:
            return None
    return None


# Function to find 'be + past participle' patterns
def find_be_pp(tokens, tagged_tokens=None):
    """Find 'be + past participle' sequences.

    A match is a form of 'be' immediately followed by a word whose POS tag is
    'VBN'. The tag alone decides what counts as a past participle, so
    irregular forms ("was made", "is done", "were found") are included; no
    '-ed'/'-n' spelling filter is applied on top of the tag.

    Args:
        tokens: Sequence of word tokens; tagged with ``nltk.pos_tag`` unless
            ``tagged_tokens`` is supplied.
        tagged_tokens: Optional pre-computed ``(word, tag)`` pairs. Pass the
            same tagged list to both finder functions to tag a corpus once.

    Returns:
        list[str]: One "<be> <participle>" string per match, in text order.
    """
    tagged = _tag(tokens, tagged_tokens)
    return [
        f"{tagged[i][0]} {tagged[i + 1][0]}"
        for i in range(len(tagged) - 1)
        if _is_be_pp(tagged, i)
    ]


# Function to find 'be + past participle + by + agent' patterns
def find_be_pp_by_agent(tokens, tagged_tokens=None):
    """Find 'be + past participle + by + agent' sequences.

    The agent is a noun (NN/NNS/NNP/NNPS) that follows 'by' either directly
    or after a short run of determiners, possessives, adjectives or numerals,
    so both "was made by hand" and "was written by the author" match.
    Every match here is also a match of ``find_be_pp``.

    NOTE(review): the check is purely positional, so non-agentive 'by'
    phrases (e.g. "was finished by Monday") are counted as well.

    Args:
        tokens: Sequence of word tokens; tagged with ``nltk.pos_tag`` unless
            ``tagged_tokens`` is supplied.
        tagged_tokens: Optional pre-computed ``(word, tag)`` pairs.

    Returns:
        list[str]: One "<be> <participle> by <agent phrase>" string per
        match, in text order.
    """
    tagged = _tag(tokens, tagged_tokens)
    matches = []

    # Stop three short of the end: 'be', participle, 'by' and at least one
    # agent word must all fit.
    for i in range(len(tagged) - 3):
        if not _is_be_pp(tagged, i):
            continue
        if tagged[i + 2][0].lower() != 'by':
            continue
        head = _agent_head_index(tagged, i + 3)
        if head is None:
            continue
        agent = ' '.join(word for word, _ in tagged[i + 3:head + 1])
        matches.append(f"{tagged[i][0]} {tagged[i + 1][0]} by {agent}")

    return matches

# Spoken corpus: plain passives, then passives with a by-agent
spoken_be_pp_matches, spoken_be_pp_by_agent_matches = (
    find_be_pp(spoken_tokens),
    find_be_pp_by_agent(spoken_tokens),
)

# Written corpus: same two searches
written_be_pp_matches, written_be_pp_by_agent_matches = (
    find_be_pp(written_tokens),
    find_be_pp_by_agent(written_tokens),
)

# Grand total across both patterns and both corpora
total_occurrences = (
    len(spoken_be_pp_matches)
    + len(spoken_be_pp_by_agent_matches)
    + len(written_be_pp_matches)
    + len(written_be_pp_by_agent_matches)
)


In [None]:

# Tabulate the counts per pattern and corpus, then add totals and shares
data = {
    'Pattern': [
        'be + past participle',
        'be + past participle + by + agent',
    ],
    'Spoken Occurrences': [
        len(spoken_be_pp_matches),
        len(spoken_be_pp_by_agent_matches),
    ],
    'Written Occurrences': [
        len(written_be_pp_matches),
        len(written_be_pp_by_agent_matches),
    ],
}

df_occurrences = pd.DataFrame(data)

# Row-wise total over the two corpus columns
df_occurrences['Total Occurrences'] = (
    df_occurrences[['Spoken Occurrences', 'Written Occurrences']].sum(axis=1)
)
# Each pattern's share of all counted occurrences, in percent
df_occurrences['Percentage'] = (
    df_occurrences['Total Occurrences'].div(total_occurrences).mul(100)
)

print(df_occurrences)



In [None]:

# Visualization: one bar per (corpus, pattern) combination
labels = ['Spoken be+p.p.', 'Spoken be+p.p.+by+agent', 'Written be+p.p.', 'Written be+p.p.+by+agent']
values = [len(spoken_be_pp_matches), len(spoken_be_pp_by_agent_matches), len(written_be_pp_matches), len(written_be_pp_by_agent_matches)]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(labels, values, color=['blue', 'green', 'orange', 'red'])
ax.set_xlabel('Patterns and Data Types')
ax.set_ylabel('Number of Occurrences')
ax.set_title('Occurrences of "be + past participle" and "be + past participle + by + agent"')

# Save BEFORE showing: in notebook backends plt.show() renders and closes the
# figure, so a savefig call placed after it writes an empty image.
fig.savefig('Occurrences_comparison.png')  # Save the plot as a file
plt.show()

In [None]:
# Chi-Squared Test
# Every 'be + p.p. + by + agent' match is also a 'be + p.p.' match (same
# 'be' + VBN condition, plus the by-phrase), so the raw counts overlap.
# A chi-squared test of independence needs mutually exclusive categories,
# so each corpus is split into passives WITHOUT a by-agent and passives
# WITH a by-agent.
spoken_with_agent = len(spoken_be_pp_by_agent_matches)
written_with_agent = len(written_be_pp_by_agent_matches)

observed = [
    [len(spoken_be_pp_matches) - spoken_with_agent, spoken_with_agent],    # spoken: agentless, by-agent
    [len(written_be_pp_matches) - written_with_agent, written_with_agent]  # written: agentless, by-agent
]

# For a 2x2 table chi2_contingency applies Yates' continuity correction by
# default. It raises ValueError if any expected frequency is zero (e.g. no
# by-agent passives found in either corpus).
chi2_stat, p_val, dof, ex = chi2_contingency(observed)

print("\nChi-squared Test:")
print(f"Chi-squared Statistic: {chi2_stat}")
print(f"p-value: {p_val}")
print(f"Degrees of Freedom: {dof}")
print(f"Expected Frequencies: {ex}")





---
The End