<a href="https://colab.research.google.com/github/sundaybest3/s24Corpus-final/blob/main/Corpus/NLTK_spokenwritten.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLTK analysis example: spoken vs. written comparison (0605-updated)

In [None]:
!pip install nltk

# POS tagging with NLTK: finding passives and passives with an agent




In [None]:
import pandas as pd
import requests
from io import StringIO
import nltk
from nltk.tokenize import word_tokenize
import re
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency

# Download necessary NLTK resources.
# NLTK 3.9+ loads the tokenizer and tagger from the renamed 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' packages, while older releases use the
# original names. Requesting both keeps the notebook working with whichever
# NLTK version the unpinned install above provides.
nltk.download('punkt')  # Tokenizer model (older NLTK releases)
nltk.download('punkt_tab')  # Tokenizer model (NLTK 3.9+)
nltk.download('averaged_perceptron_tagger')  # POS tagger model (older NLTK releases)
nltk.download('averaged_perceptron_tagger_eng')  # POS tagger model (NLTK 3.9+)
nltk.download('wordnet')

In [None]:
# Function to read and process data from a URL
def read_and_process_data(url):
    """Download a CSV file and return the alphabetic word tokens of its 'text' column.

    Args:
        url: Address of a CSV file that contains a column named 'text'.

    Returns:
        A list of word tokens from all rows joined together. Tokens containing
        any non-alphabetic character (punctuation, digits, clitics such as
        "'s") are dropped.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the CSV has no 'text' column.
    """
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    # Fail loudly on 4xx/5xx instead of parsing an error page as CSV.
    response.raise_for_status()

    data = pd.read_csv(StringIO(response.text))

    # Drop missing cells first: astype(str) would turn NaN into the string
    # 'nan', which passes the isalpha() filter and pollutes the corpus.
    texts = data['text'].dropna().astype(str)
    combined_text = ' '.join(texts)

    tokens = word_tokenize(combined_text)
    return [word for word in tokens if word.isalpha()]  # Remove punctuation

# Raw CSV locations on GitHub: spoken corpus first, written corpus second
spoken_url = (
    'https://raw.githubusercontent.com/sundaybest3/s24Corpus-final/main/Corpus/Cleantext0605.csv'
)
written_url = (
    'https://raw.githubusercontent.com/sundaybest3/s24Corpus-final/main/now_final.csv'
)

# Download and tokenize both corpora (spoken is processed before written)
spoken_tokens, written_tokens = (
    read_and_process_data(corpus_url) for corpus_url in (spoken_url, written_url)
)


In [None]:
# Inflected forms of the passive auxiliaries "be" and "get"; compared
# case-insensitively against whole tokens.
_PASSIVE_AUXILIARIES = frozenset({
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'get', 'gets', 'got', 'gotten', 'getting',
})

# Penn Treebank tags that may precede the head of an agent phrase
# (determiner, possessive pronoun, cardinal number, adjectives).
_AGENT_MODIFIER_TAGS = frozenset({'DT', 'PRP$', 'CD', 'JJ', 'JJR', 'JJS'})

# Tags accepted as the head of an agent phrase: nouns and personal pronouns.
_AGENT_HEAD_TAGS = frozenset({'NN', 'NNS', 'NNP', 'NNPS', 'PRP'})


def _is_passive_pair(tagged_tokens, i):
    """Return True if token i is a passive auxiliary and token i + 1 is tagged VBN.

    The participle is recognised by its POS tag alone, so irregular forms
    ('made', 'told', 'sent') count just like '-ed' / '-n' forms.
    """
    return (
        i + 1 < len(tagged_tokens)
        and tagged_tokens[i][0].lower() in _PASSIVE_AUXILIARIES
        and tagged_tokens[i + 1][1] == 'VBN'
    )


# Function to find 'be + past participle' patterns
def find_be_pp(tokens):
    """Find every 'auxiliary + past participle' pair in a token list.

    Args:
        tokens: List of word strings; it is POS-tagged with nltk.pos_tag.

    Returns:
        A list of "<auxiliary> <participle>" strings in corpus order, one per
        occurrence (duplicates are kept), with the original casing.
    """
    tagged_tokens = nltk.pos_tag(tokens)
    return [
        f"{tagged_tokens[i][0]} {tagged_tokens[i + 1][0]}"
        for i in range(len(tagged_tokens) - 1)
        if _is_passive_pair(tagged_tokens, i)
    ]


# Function to find 'be + past participle + by + agent' patterns
def find_be_pp_by_agent(tokens):
    """Find every 'auxiliary + past participle + by + agent' sequence in a token list.

    The agent is a noun or personal pronoun, optionally preceded by
    determiners, possessives, numbers or adjectives, so both
    "was arrested by police" and "was arrested by the local police" match.

    Args:
        tokens: List of word strings; it is POS-tagged with nltk.pos_tag.

    Returns:
        A list of "<auxiliary> <participle> by <agent phrase>" strings in
        corpus order, one per occurrence. The first two words always form a
        pair that find_be_pp reports for the same tokens.
    """
    tagged_tokens = nltk.pos_tag(tokens)
    total = len(tagged_tokens)
    matches = []

    for i in range(total - 3):
        if not _is_passive_pair(tagged_tokens, i):
            continue
        if tagged_tokens[i + 2][0].lower() != 'by':
            continue

        # Walk over any modifiers to reach the head of the agent phrase.
        head = i + 3
        while head < total and tagged_tokens[head][1] in _AGENT_MODIFIER_TAGS:
            head += 1

        if head < total and tagged_tokens[head][1] in _AGENT_HEAD_TAGS:
            agent = ' '.join(word for word, _tag in tagged_tokens[i + 3:head + 1])
            matches.append(f"{tagged_tokens[i][0]} {tagged_tokens[i + 1][0]} by {agent}")

    return matches

# Run both finders on the spoken corpus: plain passives first, then by-agent passives
spoken_be_pp_matches, spoken_be_pp_by_agent_matches = (
    finder(spoken_tokens) for finder in (find_be_pp, find_be_pp_by_agent)
)

# Same two finders, same order, on the written corpus
written_be_pp_matches, written_be_pp_by_agent_matches = (
    finder(written_tokens) for finder in (find_be_pp, find_be_pp_by_agent)
)

# Show the four match lists, one per line
print(
    spoken_be_pp_matches,
    spoken_be_pp_by_agent_matches,
    written_be_pp_matches,
    written_be_pp_by_agent_matches,
    sep='\n',
)



In [None]:
# Convert lists to sets for comparison
spoken_be_pp_set = set(spoken_be_pp_matches)
spoken_be_pp_by_agent_set = set(spoken_be_pp_by_agent_matches)
written_be_pp_set = set(written_be_pp_matches)
written_be_pp_by_agent_set = set(written_be_pp_by_agent_matches)

# A by-agent match reads "<be> <p.p.> by <agent>" while a plain match reads
# "<be> <p.p.>", so the raw strings can never be equal and intersecting them
# directly always gives an empty set. Reduce each by-agent match to its first
# two words (the "<be> <p.p.>" pair) before comparing.
spoken_agent_pairs = {' '.join(match.split()[:2]) for match in spoken_be_pp_by_agent_set}
written_agent_pairs = {' '.join(match.split()[:2]) for match in written_be_pp_by_agent_set}

# Find common matches within each dataset: 'be + p.p.' pairs that also occur
# with a by-agent
common_spoken_matches = spoken_agent_pairs.intersection(spoken_be_pp_set)
common_written_matches = written_agent_pairs.intersection(written_be_pp_set)

# Print results
print("Common matches in spoken data:")
print(common_spoken_matches)

print("\nCommon matches in written data:")
print(common_written_matches)

In [None]:

# Calculate total occurrences.
# Every 'be + p.p. + by + agent' match is also reported as a 'be + p.p.' match
# (same auxiliary + participle pair), so the 'be + p.p.' count alone is the
# total number of passives; adding the two counts would count the by-agent
# passives twice.
total_occurrences_spoken = len(spoken_be_pp_matches)
total_occurrences_written = len(written_be_pp_matches)

# Split each total into two mutually exclusive groups
spoken_with_agent = len(spoken_be_pp_by_agent_matches)
spoken_agentless = total_occurrences_spoken - spoken_with_agent
written_with_agent = len(written_be_pp_by_agent_matches)
written_agentless = total_occurrences_written - written_with_agent


def _percentage(count, total):
    """Return count as a percentage of total, or 0.0 when total is 0."""
    return count / total * 100 if total else 0.0


# Create the dataframe
data = {
    'Category': ['Spoken be+p.p. (no agent)', 'Spoken be+p.p.+by+agent', 'Written be+p.p. (no agent)', 'Written be+p.p.+by+agent'],
    'Occurrences': [spoken_agentless, spoken_with_agent, written_agentless, written_with_agent],
    'Percentage': [
        _percentage(spoken_agentless, total_occurrences_spoken),
        _percentage(spoken_with_agent, total_occurrences_spoken),
        _percentage(written_agentless, total_occurrences_written),
        _percentage(written_with_agent, total_occurrences_written)
    ]
}

df = pd.DataFrame(data)

print(df)





In [None]:
# Hard-coded counts shown in the chart.
# NOTE(review): presumably copied from an earlier run of the match-finding
# cells -- confirm they are still current before publishing the figure.
data = {
    "Category": ["Spoken agentless passives", "Spoken passives with agent", "Written agentless passives", "Written passives with agent"],
    "Occurrences": [620, 24, 9211, 445]
}

# Create DataFrame
df = pd.DataFrame(data)

# Colors for the bars
colors = ['skyblue', 'salmon', 'lightgreen', 'orange']

# Plotting the data
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(df["Category"], df["Occurrences"], color=colors)

# Adding numbers on top of each bar; ha='center' centres the label over the
# bar instead of starting the text at the bar's midpoint
for bar in bars:
    yval = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2.0, yval, int(yval), ha='center', va='bottom')

# Adding titles and labels
ax.set_title('Occurrences by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Occurrences')
# Right-align the rotated labels so each one ends under its own bar
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()





In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# 2x2 contingency table of hard-coded counts.
# NOTE(review): presumably rows are agentless / with-agent passives and columns
# are spoken / written (the figures match the bar chart) -- confirm. The test
# assumes the cells are mutually exclusive, so also confirm that 620 and 9211
# do not already include the 24 and 445 with-agent cases.
data = [
    [620, 9211],
    [24, 445],
]

# Chi-squared test of independence on the table above
chi2, p, dof, expected = chi2_contingency(data)

# Report the statistic, its p-value, the degrees of freedom and the
# expected counts under independence
print("Chi-squared statistic:", chi2)
print("p-value:", p)
print("Degrees of freedom:", dof)
print("Expected frequencies:")
print(expected)

---
The End