<a href="https://colab.research.google.com/github/sundaybest3/s24Corpus-final/blob/main/Corpus/NOW_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NOW data Text pre-processing

+ Last updated (6/12)

# 🍀Process:

1. Downloaded NOW Sample Data from COCA

*   The NOW corpus (News on the Web) sample data contains 1.7 million words of data from web-based newspapers and magazines from 2010 to 2016.
*  While other resources like Google Trends show you what people are searching for, the NOW Corpus is the only structured corpus that shows you what is actually happening in the language -- virtually right up to the present time.

2. Converted the TXT file to a CSV file.

3. Removed "@" characters and the `<p>` and `<h>` tags.

4. Split Text ID.



In [None]:
import pandas as pd

# Location of the raw NOW sample on GitHub.
url = 'https://raw.githubusercontent.com/sundaybest3/s24Corpus-final/main/rawfile_now.txt'

# Parse the file as tab-separated text straight from the URL.
df = pd.read_csv(url, sep='\t')  # Adjust separator as needed

# Keep a local CSV copy for the next step.
df.to_csv('rawfile_now.csv', index=False)

# Preview the first rows
print(df.head())


In [None]:
# Reload the CSV written by the previous cell and strip the corpus markup
# in two passes: first every '@', then <p>/<h>/<hN> tags (opening or closing).
df = (
    pd.read_csv('rawfile_now.csv')
    .replace('@', '', regex=True)
    .replace(r'<\/?p>|<\/?h[0-9]?>', '', regex=True)
)

# Preview the cleaned rows
print(df.head())

# Persist the cleaned table for the following steps
df.to_csv('now_cleanfile.csv', index=False)


# 🍀Todo:

---
# Step by step to get a cleaned text for the text column in our csv file

+ Read csv file as data (using Github link)
+ Read Column 'Text' and remove time stamps and parenthetical notes, and write the cleaned text in a new column named 'Cleantext01'

## Split Text ID info


In [None]:
# Load the cleaned table produced by the previous cell.
file_path = 'now_cleanfile.csv'
data = pd.read_csv(file_path)

# Break each 'textID' value at its first run of whitespace:
# the leading piece becomes 'id', the remainder becomes 'text'.
leading_columns = ['id', 'text']
id_and_text = data['textID'].str.split(n=1, expand=True)
data[leading_columns] = id_and_text

# The combined column is no longer needed once it has been split.
data = data.drop(columns=['textID'])

# Put 'id' and 'text' first, followed by every other column in its existing order.
remaining_columns = [name for name in data.columns if name not in leading_columns]
data = data[leading_columns + remaining_columns]

# Preview the result
print(data.head())

# Write the final table to disk
data.to_csv('now_final.csv', index=False)

# Find a word in all the text

1. Combine the text and find a word
2. For each text of the data ('Text'), find the word and add a new column with the number of cases found in the given text

In [None]:
!pip install nltk

### 1a. Combine the text and find 'be+p.p.' for example




In [None]:
import pandas as pd
import requests
from io import StringIO
import nltk
from nltk.tokenize import word_tokenize
import re  # Ensure re is imported

# Download the tokenizer models used by word_tokenize.
# NLTK 3.9 and later load 'punkt_tab' instead of the older 'punkt' model and
# raise LookupError without it, so request both resources.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# 1) Read a file from URL and assign the file to 'data' dataframe
url = 'https://raw.githubusercontent.com/sundaybest3/s24Corpus-final/main/now_final.csv'  # Replace with your actual URL
response = requests.get(url)
response.raise_for_status()  # Stop on an HTTP error instead of parsing an error page as CSV
data = pd.read_csv(StringIO(response.text))

# 2) Display column names
print("Column names:", data.columns)
print("="*50)

# 3) Combine all items in the 'text' column as a single string.
#    Missing values are dropped first: astype(str) would otherwise turn them
#    into the literal word 'nan', which ends in 'n' and would be counted as a
#    past participle after a form of 'be'.
combined_text = ' '.join(data['text'].dropna().astype(str))

# 4) Save the combined text as 'scriptall.txt'
with open('scriptall.txt', 'w', encoding='utf-8') as file:
    file.write(combined_text)

# 5) Tokenize with NLTK, keep only purely alphabetic tokens (this drops
#    punctuation as well as numbers and tokens such as "n't"), and save the
#    result as 'scriptall_nopunct.txt'
tokens = [word for word in word_tokenize(combined_text) if word.isalpha()]
text_no_punctuation = ' '.join(tokens)
with open('scriptall_nopunct.txt', 'w', encoding='utf-8') as file:
    file.write(text_no_punctuation)

# 6) Search for 'be + past participle' token pairs and keep each hit together
#    with up to 10 tokens before the be-form and the 10 tokens that follow it.
#    The patterns are applied to whole tokens, so they are compiled once here.
be_form = re.compile(r'am|is|are|was|were|been|being', re.IGNORECASE)
participle = re.compile(r'\w+ed|\w+n', re.IGNORECASE)  # heuristic: ends in -ed or -n
context_words = 10

matches = []
for i in range(len(tokens) - 1):
    if be_form.fullmatch(tokens[i]) and participle.fullmatch(tokens[i + 1]):
        start = max(0, i - context_words)
        end = min(len(tokens), i + context_words + 1)
        matches.append(' '.join(tokens[start:end]))

# Print all matches
for match in matches:
    print(match)

# 7) Print summary with how many occurrences are found in the given text
print("="*50)
print(f"Total occurrences found: {len(matches)}")

### 1b. Find 'be+p.p.+by+agent'

In [None]:
# 6) Locate every 'be + past participle + by + agent' sequence in the
#    punctuation-free text and keep it with up to 50 characters of context
#    on each side.
pattern = re.compile(
    r'\b(am|is|are|was|were|been|being)\b\s+\b(\w+ed|\w+n)\b\s+by\s+\b(\w+)\b',
    re.IGNORECASE,
)  # Regular expression for 'be + past participle + by + agent'

text_length = len(text_no_punctuation)
matches = [
    text_no_punctuation[max(0, hit.start() - 50):min(text_length, hit.end() + 50)]
    for hit in pattern.finditer(text_no_punctuation)
]

# Show each hit on its own line
for snippet in matches:
    print(snippet)

# 7) Summarise how many occurrences were found
print("="*50)
print(f"Total occurrences found: {len(matches)}")


# **Final data to process**

## [2] Cleaned data: adding a column 'Cleanedtext01'

+ data = original data
+ df = cleaned data column added

## [3] Check whether the data cleaning is appropriately processed