<a href="https://colab.research.google.com/github/sundaybest3/s24Corpus-final/blob/main/Corpus/NLTK_spokenwritten.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLTK analysis example: spoken vs. written comparison (0605-updated)

In [None]:
!pip install nltk

# POS tagging with NLTK: finding passives ('be + past participle') and passives with an agent ('be + past participle + by + agent')




In [None]:
import re
from io import StringIO

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import requests
from nltk.tokenize import word_tokenize
from scipy.stats import chi2_contingency

# Download necessary NLTK resources.
# NLTK is installed unpinned, and recent releases (3.9+) look for 'punkt_tab'
# and 'averaged_perceptron_tagger_eng' instead of the older 'punkt' and
# 'averaged_perceptron_tagger' packages. Fetching both generations keeps
# word_tokenize / pos_tag working with whichever version pip installed.
for nltk_resource in (
    'punkt',                           # Tokenizer model (older NLTK)
    'punkt_tab',                       # Tokenizer model (newer NLTK)
    'averaged_perceptron_tagger',      # POS tagger model (older NLTK)
    'averaged_perceptron_tagger_eng',  # POS tagger model (newer NLTK)
    'wordnet',
):
    nltk.download(nltk_resource)

In [None]:
# Function to read and process data from a URL
def read_and_process_data(url, timeout=60):
    """Download a CSV file and return its 'text' column as alphabetic tokens.

    Args:
        url: Address of a CSV file that has a column named 'text'.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        list[str]: The tokens produced by ``word_tokenize`` over all rows
        joined with single spaces, keeping only tokens for which
        ``str.isalpha()`` is true.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the CSV has no 'text' column.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as CSV.
    response.raise_for_status()

    data = pd.read_csv(StringIO(response.text))

    # Drop missing cells before casting: astype(str) would turn NaN into the
    # literal word 'nan', which passes the isalpha() filter below and would
    # end up in the corpus as a spurious token.
    texts = data['text'].dropna().astype(str)
    combined_text = ' '.join(texts)

    # Keep purely alphabetic tokens (removes punctuation and numbers).
    return [word for word in word_tokenize(combined_text) if word.isalpha()]

# CSV locations of the two corpora (swap in other URLs to analyse different data)
spoken_url = 'https://github.com/sundaybest3/s24Corpus-final/raw/main/Corpus/TEDdata/Cleantext0605.csv'
written_url = 'https://raw.githubusercontent.com/sundaybest3/s24Corpus-final/main/now_final.csv'

# Download and tokenize both corpora with the same pipeline (spoken first, then written)
spoken_tokens, written_tokens = [
    read_and_process_data(corpus_url) for corpus_url in (spoken_url, written_url)
]


In [None]:
# Function to find 'be + past participle' patterns
def find_be_pp(tokens):
    """Return every adjacent "<auxiliary> <participle>" pair found in ``tokens``.

    The tokens are POS-tagged with ``nltk.pos_tag``. A pair of neighbouring
    words is reported when

    * the first word matches am/is/are/get/got/was/were/been/being
      (case-insensitive, anchored at the start of the token and closed by a
      word boundary), and
    * the second word starts with a run of word characters ending in "ed"
      or "n" and carries the tag ``VBN``.

    Returns:
        list[str]: "<first> <second>" strings in order of appearance, with
        the tokens' original casing.

    NOTE(review): the auxiliary list has no bare "be", and the spelling
    filter rejects VBN-tagged words without an -ed/-n ending (e.g. "made").
    Confirm both are intended before relying on the counts.
    """
    aux_re = re.compile(r'\b(am|is|are|get|got|was|were|been|being)\b', re.IGNORECASE)
    participle_re = re.compile(r'\b\w+ed\b|\b\w+n\b', re.IGNORECASE)
    tagged = nltk.pos_tag(tokens)

    # Slide a two-word window over the tagged sequence.
    return [
        f"{first} {second}"
        for (first, _), (second, second_tag) in zip(tagged, tagged[1:])
        if aux_re.match(first) and participle_re.match(second) and second_tag == 'VBN'
    ]

# Function to find 'be + past participle + by + agent' patterns
def find_be_pp_by_agent(tokens):
    """Return "<auxiliary> <participle> by <noun>" hits found in ``tokens``.

    The tokens are POS-tagged with ``nltk.pos_tag`` and scanned with a
    four-word window. A window is reported when

    * word 1 matches am/is/are/get/got/was/were/been/being (case-insensitive),
    * word 2 starts with word characters ending in "ed" or "n" and is
      tagged ``VBN``,
    * word 3 is "by" in any casing, and
    * word 4 is tagged NN, NNS, NNP or NNPS.

    Returns:
        list[str]: "<word1> <word2> by <word4>" strings in order of
        appearance; the middle "by" is always written in lower case.

    NOTE(review): the agent must be a noun directly after "by", so phrases
    such as "by the committee" (determiner first) are not matched. Confirm
    this is intended.
    """
    aux_re = re.compile(r'\b(am|is|are|get|got|was|were|been|being)\b', re.IGNORECASE)
    participle_re = re.compile(r'\b\w+ed\b|\b\w+n\b', re.IGNORECASE)
    noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']
    tagged = nltk.pos_tag(tokens)

    found = []
    windows = zip(tagged, tagged[1:], tagged[2:], tagged[3:])
    for (aux, _), (verb, verb_tag), (prep, _), (agent, agent_tag) in windows:
        # Guard 1: the window must open with a passive verb group.
        if not (aux_re.match(aux) and participle_re.match(verb) and verb_tag == 'VBN'):
            continue
        # Guard 2: it must continue with "by" plus a noun.
        if prep.lower() != 'by' or agent_tag not in noun_tags:
            continue
        found.append(f"{aux} {verb} by {agent}")

    return found

# Run both pattern searches over the spoken corpus, then the written corpus
spoken_be_pp_matches, spoken_be_pp_by_agent_matches = (
    find_be_pp(spoken_tokens),
    find_be_pp_by_agent(spoken_tokens),
)
written_be_pp_matches, written_be_pp_by_agent_matches = (
    find_be_pp(written_tokens),
    find_be_pp_by_agent(written_tokens),
)

# Per-corpus totals and the grand total (used as the percentage denominator).
# NOTE(review): every 'be + p.p. + by + agent' hit also satisfies the plain
# 'be + p.p.' test, so these sums count agentive passives twice. Confirm
# whether the plain count should exclude them.
total_occurrences_spoken = sum(
    map(len, (spoken_be_pp_matches, spoken_be_pp_by_agent_matches))
)
total_occurrences_written = sum(
    map(len, (written_be_pp_matches, written_be_pp_by_agent_matches))
)
total_occurrences = sum((total_occurrences_spoken, total_occurrences_written))




In [None]:
# One entry per table row: (label, list of matches, total for that corpus)
summary_rows = [
    ('Spoken passives', spoken_be_pp_matches, total_occurrences_spoken),
    ('Spoken passives with agent', spoken_be_pp_by_agent_matches, total_occurrences_spoken),
    ('Written passives', written_be_pp_matches, total_occurrences_written),
    ('Written passives with agent', written_be_pp_by_agent_matches, total_occurrences_written),
]

# Column-oriented data; each percentage is relative to the grand total of both corpora
data = {
    'Category': [label for label, _, _ in summary_rows],
    'Occurrences': [len(matches) for _, matches, _ in summary_rows],
    'Total Occurrences': [corpus_total for _, _, corpus_total in summary_rows],
    'Percentage': [len(matches) / total_occurrences * 100 for _, matches, _ in summary_rows],
}

df = pd.DataFrame(data)

# Show the table as plain text
print(df)

In [None]:
# Summary table, ordered so that the spoken and written bars of each kind sit side by side.
# The figures are hard-coded (presumably copied from an earlier run of the
# table cell; TODO confirm and regenerate if the corpora change).
column_names = ('Category', 'Occurrences', 'Total Occurrences', 'Percentage')
occurrence_rows = [
    ('Spoken passives', 620, 644, 6.019417),
    ('Written passives', 9211, 9656, 89.427184),
    ('Spoken passives with agent', 24, 644, 0.233010),
    ('Written passives with agent', 445, 9656, 4.320388),
]
data = {name: list(values) for name, values in zip(column_names, zip(*occurrence_rows))}

df = pd.DataFrame(data)

# Bar chart of the raw counts
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(df['Category'], df['Occurrences'], color=['blue', 'orange', 'green', 'red'])

# Write each count just above its bar, centred horizontally
for bar in bars:
    height = bar.get_height()
    center_x = bar.get_x() + bar.get_width()/2
    ax.text(center_x, height, int(height), ha='center', va='bottom')

ax.set_xlabel('Category')
ax.set_ylabel('Occurrences')
ax.set_title('Occurrences of Passives and Passives with Agent in Spoken and Written Text')

# Write the figure to disk, then render it in the notebook
fig.savefig('Occurrences_comparison.png')
plt.show()

In [None]:
# Summary table grouped by corpus (spoken rows first, then written).
# The figures are hard-coded (presumably copied from an earlier run of the
# table cell; TODO confirm and regenerate if the corpora change).
column_names = ('Category', 'Occurrences', 'Total Occurrences', 'Percentage')
percentage_rows = [
    ('Spoken passives', 620, 644, 6.019417),
    ('Spoken passives with agent', 24, 644, 0.233010),
    ('Written passives', 9211, 9656, 89.427184),
    ('Written passives with agent', 445, 9656, 4.320388),
]
data = {name: list(values) for name, values in zip(column_names, zip(*percentage_rows))}

df = pd.DataFrame(data)

# Pie chart of each category's share of all matches
fig, ax = plt.subplots(figsize=(10, 6))
ax.pie(
    df['Percentage'],
    labels=df['Category'],
    autopct='%1.1f%%',
    startangle=90,
    colors=['blue', 'green', 'orange', 'red'],
)
ax.set_title('Percentage of Passives and Passives with Agent in Spoken and Written Text')

# Write the figure to disk, then render it in the notebook
fig.savefig('Percentage_comparison.png')
plt.show()

In [None]:
from statsmodels.stats.proportion import proportions_ztest

# Two-proportion z-test on the share of plain 'be + p.p.' hits per corpus.
# Values are hard-coded as [spoken, written]:
#   count = plain passive hits
#   nobs  = plain hits + by-agent hits for that corpus (620 + 24, 9211 + 445)
# `np` is provided by the notebook's top import cell, so this cell also runs
# on a fresh kernel (Restart & Run All).
count = np.array([620, 9211])
nobs = np.array([644, 9656])

# Perform the two-proportion z-test
stat, pval = proportions_ztest(count, nobs)

print(f"Z-statistic: {stat}")
print(f"p-value: {pval}")

In [None]:
# Two-proportion z-test on the share of 'be + p.p. + by + agent' hits per corpus.
# Values are hard-coded as [spoken, written]; the denominators are the same
# per-corpus totals used for the plain-passive test (644 and 9656).
# `np` is provided by the notebook's top import cell, so this cell also runs
# on a fresh kernel (Restart & Run All).
# NOTE(review): since 24 = 644 - 620 and 445 = 9656 - 9211, these proportions
# are the complements of the plain-passive ones, so this test should mirror
# the previous result (opposite-sign z, same p-value). Confirm it is meant
# to be reported separately.
count_with_agent = np.array([24, 445])
nobs_with_agent = np.array([644, 9656])

# Perform the two-proportion z-test
stat_with_agent, pval_with_agent = proportions_ztest(count_with_agent, nobs_with_agent)

print(f"Z-statistic (with agent): {stat_with_agent}")
print(f"p-value (with agent): {pval_with_agent}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Within-corpus proportions: each hard-coded count divided by its corpus total
categories = ['Spoken passives', 'Written passives', 'Spoken passives with agent', 'Written passives with agent']
category_counts = (620, 9211, 24, 445)
corpus_totals = (644, 9656, 644, 9656)
proportions = [hits / total for hits, total in zip(category_counts, corpus_totals)]

# One bar per category
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(categories, proportions, color=['blue', 'orange', 'green', 'red'])

# Label every bar with its value formatted as a percentage
for bar in bars:
    height = bar.get_height()
    center_x = bar.get_x() + bar.get_width()/2
    ax.text(center_x, height, f'{height:.2%}', ha='center', va='bottom')

ax.set_xlabel('Category')
ax.set_ylabel('Proportion')
ax.set_title('Proportions of Passives and Passives with Agent in Spoken and Written Texts')

# Tighten the layout, write the figure to disk, then render it in the notebook
plt.tight_layout()
fig.savefig('Proportions_comparison.png')
plt.show()

In [None]:
import numpy as np
from scipy.stats import chi2_contingency

# 2x2 contingency table of hard-coded counts.
# Rows: spoken, written. Columns: 'be + p.p.' hits, 'be + p.p. + by + agent' hits.
# NOTE(review): if the first column comes from the plain 'be + p.p.' search,
# it already contains the by-agent hits, so the two columns would overlap.
# Confirm the cells are mutually exclusive before interpreting the result.
spoken_counts = [620, 24]
written_counts = [9211, 445]
data = np.array([spoken_counts, written_counts])

# Chi-square test of independence on the table
chi2, p, dof, expected = chi2_contingency(data)

# Report the scalar results, then the expected-frequency table
for label, value in (
    ("Chi-square statistic:", chi2),
    ("p-value:", p),
    ("Degrees of freedom:", dof),
):
    print(label, value)
print("Expected frequencies:\n", expected)



---
The End