<a href="https://colab.research.google.com/github/sundaybest3/s24Corpus-final/blob/main/Corpus/NOW_preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NOW data Text pre-processing

+ Last updated (6/12)

# 🍀Process:

1. Downloaded NOW Sample Data from COCA

*   The NOW corpus (News on the Web) sample data contains 1.7 million words of data from web-based newspapers and magazines from 2010 to 2016.
*  While other resources like Google Trends show you what people are searching for, the NOW Corpus is the only structured corpus that shows you what is actually happening in the language -- virtually right up to the present time.

2. Converted Txt to csv file.

3. Removed "@" characters and the "<p>" and "<h>" markup tags.

4. Split Text ID.



In [None]:
import pandas as pd

# Source: the raw NOW sample file hosted in the course GitHub repository.
url = 'https://raw.githubusercontent.com/sundaybest3/s24Corpus-final/main/rawfile_now.txt'

# The file is parsed as tab-separated; change `sep` here if the raw file
# uses a different field separator.
df = pd.read_csv(url, sep='\t')

# Preview the parsed rows, then store them as CSV (index column omitted).
print(df.head())
df.to_csv('rawfile_now.csv', index=False)


In [None]:
# Reload the CSV written by the conversion step.
df = pd.read_csv('rawfile_now.csv')

# Remove corpus markup in two regex passes applied across the DataFrame:
#   1) every "@" character,
#   2) opening/closing <p> tags and <h> tags (optionally followed by one digit).
markup_patterns = ['@', r'<\/?p>|<\/?h[0-9]?>']
for markup in markup_patterns:
    df = df.replace(markup, '', regex=True)

print(df.head())

# Persist the markup-free table (index column omitted).
df.to_csv('now_cleanfile.csv', index=False)


# 🍀Todo:

---
# Step by step to get a cleaned text for the text column in our csv file

+ Read csv file as data (using Github link)
+ Read the 'Text' column, remove timestamps and parenthetical notes, and write the cleaned text to a new column named 'Cleanedtext01'

## Split Text ID info


In [None]:
# Load the table produced by the markup-removal step.
file_path = 'now_cleanfile.csv'
data = pd.read_csv(file_path)

# Split each 'textID' value once on whitespace (n=1): the first token becomes
# 'id' and everything after it becomes 'text'.
id_and_text = data['textID'].str.split(n=1, expand=True)
data[['id', 'text']] = id_and_text

# 'textID' is redundant after the split, so drop it.
data = data.drop(columns=['textID'])

# Put 'id' and 'text' first; all remaining columns keep their current order.
leading_cols = ['id', 'text']
trailing_cols = [col for col in data.columns if col not in leading_cols]
data = data[leading_cols + trailing_cols]

# Preview the result
print(data.head())

# Write the final table to CSV (index column omitted)
data.to_csv('now_final.csv', index=False)

# Find a word in all of the text

1. Combine the text and find a word
2. For each text of the data ('Text'), find the word and add a new column with the number of cases found in the given text

In [None]:
!pip install nltk

### 1a. Combine the text and find 'very' for example

In [None]:
import pandas as pd
import requests
from io import StringIO
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Download necessary NLTK resources (tokenizer models for word_tokenize).
# Newer NLTK releases load 'punkt_tab' rather than 'punkt', so request both;
# nltk.download() only reports a resource name it cannot find, it does not raise.
nltk.download('punkt')
nltk.download('punkt_tab')

# 1) Read a file from URL and assign the file to 'data' dataframe
url = 'https://raw.githubusercontent.com/sundaybest3/s24Corpus-final/main/now_final.csv'  # Replace with your actual URL
response = requests.get(url)
response.raise_for_status()  # stop on HTTP errors instead of parsing an error page as CSV
data = pd.read_csv(StringIO(response.text))

# 2) Display column names
print("Column names:", data.columns)
print("="*50)

# 3) Combine all items in the 'Text' column as a single string.
#    A space separator keeps the last word of one text from fusing with the
#    first word of the next, which would corrupt or drop both tokens.
# NOTE(review): the split step earlier in this notebook names its new columns
# 'id' and 'text'; confirm the hosted now_final.csv really has a 'Text' column.
combined_text = ' '.join(data['Text'].astype(str))

# 4) Save the combined text as 'scriptall.txt'
with open('/content/scriptall.txt', 'w', encoding='utf-8') as file:
    file.write(combined_text)

# 5) Tokenize with NLTK, keep alphabetic tokens only (this drops punctuation
#    and numbers), and save the result as 'scriptall_nopunct.txt'
tokens = word_tokenize(combined_text)
tokens = [word for word in tokens if word.isalpha()]
text_no_punctuation = ' '.join(tokens)
with open('/content/scriptall_nopunct.txt', 'w', encoding='utf-8') as file:
    file.write(text_no_punctuation)

# 6) Find every token equal to 'very' (any capitalisation) and print it with
#    up to 10 tokens of context on each side
target_word = 'very'
context_words = 10
occurrences = 0
for i, word in enumerate(tokens):
    if word.lower() == target_word:
        start = max(0, i - context_words)
        # +1 because the slice end is exclusive: without it the hit itself
        # would use up one of the right-hand context slots.
        end = min(len(tokens), i + context_words + 1)
        print(' '.join(tokens[start:end]))
        occurrences += 1

# 7) Print summary with how many occurrences are found in the given text
print("="*50)
print(f"Total occurrences found: {occurrences}")


### 1b. Combine the text and find a word (using user input)

In [None]:
import pandas as pd
import requests
from io import StringIO
import nltk
from nltk.tokenize import word_tokenize
import string

# Download necessary NLTK resources (tokenizer models for word_tokenize).
# Newer NLTK releases load 'punkt_tab' rather than 'punkt', so request both;
# nltk.download() only reports a resource name it cannot find, it does not raise.
nltk.download('punkt')
nltk.download('punkt_tab')

# 1) Read a file from URL and assign the file to 'data' dataframe
url = 'https://raw.githubusercontent.com/MK316/Spring2024/main/Corpus/TEDdata/sample1.csv'
response = requests.get(url)
response.raise_for_status()  # stop on HTTP errors instead of parsing an error page as CSV
data = pd.read_csv(StringIO(response.text))

# 2) Display column names
print("Column names:", data.columns)
print("="*50)

# 3) Combine all items in the 'Text' column as a single string.
#    A space separator keeps the last word of one text from fusing with the
#    first word of the next, which would corrupt or drop both tokens.
combined_text = ' '.join(data['Text'].astype(str))

# 4) Save the combined text as 'scriptall.txt'
with open('/content/scriptall.txt', 'w', encoding='utf-8') as file:
    file.write(combined_text)

# 5) Tokenize with NLTK, keep alphabetic tokens only (this drops punctuation
#    and numbers), and save the result as 'scriptall_nopunct.txt'
tokens = word_tokenize(combined_text)
tokens = [word for word in tokens if word.isalpha()]
text_no_punctuation = ' '.join(tokens)
with open('/content/scriptall_nopunct.txt', 'w', encoding='utf-8') as file:
    file.write(text_no_punctuation)

# Get user input for the word to find; surrounding whitespace typed by
# accident would otherwise prevent every match.
search_word = input("Enter the word to find: ").strip()

# 6) Find every token equal to the search word (any capitalisation) and print
#    it with up to 10 tokens of context on each side
target_word = search_word.lower()  # lowercase once, outside the loop
context_words = 10
occurrences = 0
for i, word in enumerate(tokens):
    if word.lower() == target_word:
        start = max(0, i - context_words)
        # +1 because the slice end is exclusive: without it the hit itself
        # would use up one of the right-hand context slots.
        end = min(len(tokens), i + context_words + 1)
        print(' '.join(tokens[start:end]))
        occurrences += 1

# 7) Print summary with how many occurrences are found in the given text
print("="*50)
print(f"Total occurrences found: {occurrences}")


### 2. Find a word in each text and add the count as a separate column (e.g., 'CountVery')

In [None]:
#@markdown User input for a word to search, user input for the column name to record the number of occurrences
import pandas as pd
import requests
from io import StringIO
import nltk
from nltk.tokenize import word_tokenize

# Download necessary NLTK resources (tokenizer models for word_tokenize).
# Newer NLTK releases load 'punkt_tab' rather than 'punkt', so request both;
# nltk.download() only reports a resource name it cannot find, it does not raise.
nltk.download('punkt')
nltk.download('punkt_tab')

# 1) Read a file from URL and assign the file to 'data' dataframe
url = 'https://raw.githubusercontent.com/MK316/Spring2024/main/Corpus/TEDdata/sample1.csv'
response = requests.get(url)
response.raise_for_status()  # stop on HTTP errors instead of parsing an error page as CSV
data = pd.read_csv(StringIO(response.text))

# 2) Display column names
print("Column names:", data.columns)
print("="*50)

# Get user input for the word to find and the new column name.
# The search word is stripped and lowercased so that stray whitespace or
# capitalisation in the input cannot prevent matches.
search_word = input("Enter the word to find: ").strip().lower()
new_column_name = input("Enter the new column name for word occurrences: ")

# 3) Define a function to count occurrences of a specified word in a text
def count_word_occurrences(text, word):
    """Count the tokens of ``text`` that equal ``word``, ignoring case.

    Args:
        text: The text to search. Any non-string value (for example the NaN
            that pandas uses for an empty cell) is treated as having no
            tokens instead of raising inside the tokenizer.
        word: The word to count. It is lowercased here, so callers do not
            need to normalise its case themselves.

    Returns:
        int: Number of tokens returned by ``word_tokenize(text)`` whose
        lowercase form equals the lowercase form of ``word``.
    """
    if not isinstance(text, str):
        return 0
    target = word.lower()
    return sum(1 for token in word_tokenize(text) if token.lower() == target)

# 4) Count the search word in every 'Text' entry and store the per-row totals
#    in a column with the user-chosen name
data[new_column_name] = data['Text'].apply(count_word_occurrences, args=(search_word,))

# Show the first rows of the new column
print(data[[new_column_name]].head())

# Optionally, save the updated DataFrame to a new CSV file
# data.to_csv('/content/updated_data.csv', index=False)


# **Final data to process**

## [1] Data to read

[data link](https://raw.githubusercontent.com/MK316/Spring2024/main/Corpus/TEDdata/TED100.csv)

In [None]:
import pandas as pd

# Raw GitHub location of the TED100.csv data file.
datalink = "https://raw.githubusercontent.com/MK316/Spring2024/main/Corpus/TEDdata/TED100.csv"

# Read it as UTF-8 into 'data' and show the first rows as the cell output.
data = pd.read_csv(filepath_or_buffer=datalink, encoding="utf-8")
data.head()

## [2] Cleaned data: adding a column 'Cleanedtext01'

+ data = original data
+ df = cleaned data column added

In [None]:

import pandas as pd
import re

# 'data' is the original DataFrame. Work on an independent copy so that
# columns added to 'df' do not also appear in 'data' (plain assignment would
# make both names refer to the same object).
df = data.copy()

def clean_text(text):
    """Strip timestamps and parenthetical notes from ``text``.

    Two substitutions run in this order:
      1. every two-digit ``NN:NN`` pair that is directly followed by a
         newline is removed together with that newline;
      2. every ``(...)`` span that opens and closes on the same line is
         removed (shortest match).

    Returns the resulting string.
    """
    removal_patterns = (r'\d{2}:\d{2}\n', r'\(.*?\)')
    for removal in removal_patterns:
        text = re.sub(removal, '', text)
    return text

# Build the cleaned column by running clean_text over each 'Text' entry.
df['Cleanedtext01'] = df['Text'].map(clean_text)

# Spot-check the first row: take the first 1000 characters of the text
# before and after cleaning.
first_row = df.iloc[0]
original_text = first_row['Text'][:1000]
cleaned_text = first_row['Cleanedtext01'][:1000]

# Print both excerpts, separated by a divider line.
print(
    "Original Text:",
    original_text,
    "=" * 50,
    "\nCleaned Text:",
    cleaned_text,
    sep="\n",
)

## [3] Check whether the data cleaning is appropriately processed

In [None]:
#@markdown 1. Check the first (timestamp) for both 'Text' and 'Cleanedtext01'
import pandas as pd
import re

# Assuming 'df' is your DataFrame
def remove_and_report_timestamps(text):
    """Remove timestamp lines from ``text`` and report what was removed.

    A timestamp here is a two-digit ``NN:NN`` pair directly followed by a
    newline.

    Returns:
        tuple: ``(text_without_timestamps, matches)`` where ``matches`` is the
        list of every matched timestamp string (each including its trailing
        newline), in order of appearance.
    """
    timestamp_re = re.compile(r'\d{2}:\d{2}\n')
    found = timestamp_re.findall(text)
    stripped = timestamp_re.sub('', text)
    return stripped, found

# Ask which text to inspect. The prompt is 1-based; the row lookup is 0-based.
tn = int(input("Type the index of a text to check (1~100): "))
n = tn - 1

# --- Original 'Text' column: list every timestamp it contains ---
cleaned_text_original, timestamp_matches_original = remove_and_report_timestamps(df['Text'][n])

if not timestamp_matches_original:
    print("No timestamp pattern found in the original text.")
else:
    print(f"Found {len(timestamp_matches_original)} occurrences of the timestamp pattern in original text:")
    for match in timestamp_matches_original:
        # Each match ends with the newline that followed it; strip it for display.
        print(match.strip())

# --- 'Cleanedtext01' column: any timestamp found here survived the cleaning ---
cleaned_text_cleaned, timestamp_matches_cleaned = remove_and_report_timestamps(df['Cleanedtext01'][n])

if not timestamp_matches_cleaned:
    print("No timestamp pattern found in the cleaned text.")
else:
    print(f"Found {len(timestamp_matches_cleaned)} occurrences of the timestamp pattern in cleaned text:")
    for match in timestamp_matches_cleaned:
        print(match.strip())


In [None]:
#@markdown 2. Check the second (parenthetical notes) for both 'Text' and 'Cleanedtext01'
import pandas as pd
import re

# Assuming 'df' is your DataFrame
def remove_and_report_timestamps(text):
    """Remove parenthetical notes from ``text`` and report what was removed.

    NOTE(review): despite its name, this version matches parenthesised spans
    ``(...)``, not timestamps, and it rebinds the name used by the
    timestamp-check cell. The name is kept because the code below calls it.

    Only spans that open and close on the same line are matched (shortest
    match).

    Returns:
        tuple: ``(text_without_notes, matches)`` where ``matches`` lists every
        matched ``(...)`` string in order of appearance.
    """
    note_re = re.compile(r'\(.*?\)')
    found = note_re.findall(text)
    stripped = note_re.sub('', text)
    return stripped, found

# Ask which text to inspect. The prompt is 1-based; the row lookup is 0-based.
ts = input("Which text to check (1~100): ")
ts = int(ts)
s = ts-1

# Run the parenthetical-note check on the original 'Text' entry.
# The variable names still say "timestamp" (they mirror the timestamp-check
# cell), but in this cell they hold parenthetical matches.
cleaned_text_original, timestamp_matches_original = remove_and_report_timestamps(df['Text'][s])

# Print the number of occurrences and list each occurrence for 'Text'.
# The messages say "parenthetical" because that is the pattern checked here.
if timestamp_matches_original:
    print(f"Found {len(timestamp_matches_original)} occurrences of the parenthetical pattern in original text:")
    for match in timestamp_matches_original:
        print(match.strip())
else:
    print("No parenthetical pattern found in the original text.")

# Run the same check on 'Cleanedtext01'; anything found here survived the cleaning.
cleaned_text_cleaned, timestamp_matches_cleaned = remove_and_report_timestamps(df['Cleanedtext01'][s])

# Print the number of occurrences and list each occurrence for 'Cleanedtext01'
if timestamp_matches_cleaned:
    print(f"Found {len(timestamp_matches_cleaned)} occurrences of the parenthetical pattern in cleaned text:")
    for match in timestamp_matches_cleaned:
        print(match.strip())
else:
    print("No parenthetical pattern found in the cleaned text.")


## [4] Text to combine for searching (to check)

In [None]:
import pandas as pd
import requests
from io import StringIO
import string


# 2) Combine all items in the 'Cleanedtext01' column as a single string and
#    remove punctuation. A space separator keeps the last word of one text
#    from fusing with the first word of the next once punctuation is stripped.
combined_text = ' '.join(df['Cleanedtext01'].astype(str))
combined_text = combined_text.translate(str.maketrans('', '', string.punctuation))

# 3) Get user input for the word to find.
#    Stray whitespace around the typed word would otherwise prevent matches.
search_word = input("Enter the word to find: ").strip()
match_type = input("Type 'c' complete matches only, or 'p' for partial matches: ").lower()

# 4) Function to find occurrences
def find_occurrences(text, word, match_type):
    """Return a context snippet for every occurrence of ``word`` in ``text``.

    Matching ignores case and occurrences do not overlap.

    Args:
        text: The text to search.
        word: The word (or fragment) to look for. An empty string yields no
            occurrences; searching for it can never advance through the text.
        match_type: ``'c'`` matches whole words only, i.e. ``word`` must not
            be directly preceded or followed by a word character. This also
            finds words at the very start or end of the text and words next
            to newlines, which a plain space-delimited search misses. Any
            other value matches ``word`` anywhere, including inside longer
            words.

    Returns:
        list[str]: One snippet per occurrence, in order of appearance. Each
        snippet is the matched text plus up to 30 characters on either side.
    """
    import re  # local import: this cell does not import `re` itself

    if not word:
        return []

    literal = re.escape(word)  # treat the search word as plain text, not as a regex
    pattern = rf'(?<!\w){literal}(?!\w)' if match_type == 'c' else literal

    occurrences = []
    for hit in re.finditer(pattern, text, flags=re.IGNORECASE):
        # Clamp the 30-character context window to the bounds of the text.
        start = max(0, hit.start() - 30)
        end = min(len(text), hit.end() + 30)
        occurrences.append(text[start:end])

    return occurrences

# Collect a context snippet for every hit of the search word.
occurrences = find_occurrences(combined_text, search_word, match_type)

# 5) Report the total; when there are more than 10 hits, let the user choose
#    between all of them and just the first 10
divider = "=" * 50
total_found = len(occurrences)
print(f"Total occurrences found: {total_found}")
print(divider)
if total_found > 10:
    choice = input("More than 10 occurrences found. Type 'a' to display all or '10' to display only the first 10: ").lower()
    print(divider)
    # Any answer other than '10' keeps the full list.
    if choice == '10':
        occurrences = occurrences[:10]

# 6) Display the selected snippets, one per line
for occurrence in occurrences:
    print(occurrence)

# 7) Closing divider
print(divider)


# Saving the processed file

In [None]:
# Preview the final table, then export it as UTF-8 CSV without the index column.
print(df.head())
df.to_csv("Cleanedtext01.csv", index=False, encoding="utf-8")