In [6]:
import re

def clean_file(input_filepath, output_filepath):
    """
    Clean a text file and save the result to another file.

    The input is read as UTF-8 and processed in this order:
      1. Every occurrence of the substring 'json' is replaced with a comma
         (this is a plain substring replace, so it also hits 'json' inside
         longer words).
      2. Every triple-backtick fence, YouTube watch URL, youtu.be short URL
         and literal '[' or ']' is removed.
      3. Each run of whitespace (including newlines) is collapsed to a single
         space and the result is stripped.
    The cleaned text is then written to output_filepath as UTF-8.

    Errors are reported with print() instead of being raised, so a caller
    cannot detect failure from the return value.

    Args:
        input_filepath (str): The path to the input .txt file.
        output_filepath (str): The path where the cleaned content will be saved.

    Returns:
        None
    """
    try:
        # Read the content of the input file
        with open(input_filepath, 'r', encoding='utf-8') as file:
            content = file.read()

        # Step 1: Replace the literal string 'json' with a comma
        content_with_comma = content.replace('json', ',')

        # Step 2: Define the regex pattern to find all other unwanted strings and URL types.
        patterns_to_remove = re.compile(
            r'```|'  # Matches the literal string ```
            r'https?://(?:www\.)?youtube\.com/watch\?v=[\w-]+|'  # Matches standard youtube watch URLs
            r'https?://youtu\.be/[\w-]+|'  # Matches shortened youtu.be URLs
            r'\[|\]'  # Matches literal [ and ]
        )

        # Use re.sub to replace all occurrences of the patterns with an empty string
        cleaned_content = patterns_to_remove.sub('', content_with_comma)

        # Collapse the whitespace left behind by the removals
        cleaned_content = re.sub(r'\s+', ' ', cleaned_content).strip()

        # Write the cleaned content to the output file
        with open(output_filepath, 'w', encoding='utf-8') as file:
            file.write(cleaned_content)

        print(f"Successfully cleaned '{input_filepath}' and saved to '{output_filepath}'.")

    except FileNotFoundError as e:
        # open() raises FileNotFoundError both for a missing input file and for
        # an output path whose parent directory does not exist, so report the
        # path recorded on the exception rather than always blaming the input.
        missing_path = e.filename if e.filename is not None else input_filepath
        print(f"Error: The file '{missing_path}' was not found.")
    except Exception as e:
        # Deliberate best-effort: any other failure is printed, not raised.
        print(f"An unexpected error occurred: {e}")

# Example usage:
# Reads 'data/rel_new.txt' (relative to the working directory) and writes the
# cleaned text to 'data/rel.txt'.
if __name__ == "__main__":
    input_file, output_file = 'data/rel_new.txt', 'data/rel.txt'
    clean_file(input_file, output_file)


Successfully cleaned 'data/rel_new.txt' and saved to 'data/rel.txt'.


In [7]:
import json

# Read the JSON file with an explicit UTF-8 encoding so the result does not
# depend on the platform's default encoding (the later cell that reads this
# same file already does so).
with open("data/rel.json", 'r', encoding='utf-8') as f:
    data = json.load(f)
# Last expression of the cell: shows the number of top-level entries loaded.
len(data)

2012

In [20]:
import pyarrow as pa
import pyarrow.parquet as pq

# Schema: four columns, each stored as a string.
my_schema = pa.schema([
    pa.field(column, pa.string())
    for column in ("video_topic", "segment_description", "subtitle", "label")
])

# Zero-row table: one empty list per schema column.
empty_table = pa.Table.from_pydict(
    {column: [] for column in my_schema.names},
    schema=my_schema,
)

# Persist the empty table as 'relevant.parquet' in the working directory.
pq.write_table(empty_table, 'relevant.parquet')

In [21]:
import random

In [27]:
import pyarrow as pa
import pyarrow.parquet as pq
import json

# Schema used for the Parquet file; every column is stored as a string.
my_schema = pa.schema([
    pa.field("video_topic", pa.string()),
    pa.field("segment_description", pa.string()),
    pa.field("subtitle", pa.string()),
    pa.field("label", pa.string())
])
with open("data/rel.json", 'r', encoding='utf-8') as f:
    data = json.load(f)

# Dedicated, seeded generator so the placeholder chosen for each empty
# subtitle is the same on every run (Restart & Run All reproduces the file).
placeholder_rng = random.Random(42)

# Column-oriented buffer: one list per schema field, filled row by row below.
data_dict = {
    "video_topic": [],
    "segment_description": [],
    "subtitle": [],
    "label": []
}
for entry in data:
    data_dict['video_topic'].append(entry["video_topic"])
    data_dict["segment_description"].append(entry["segment_description"])
    if entry['subtitle'] == "":
        # Empty subtitles are replaced by one of three "silent" markers.
        data_dict["subtitle"].append(placeholder_rng.choice(["**No one is talking**", "**Silent**","**No subtitle as silent**" ]))
    else:
        data_dict['subtitle'].append(entry['subtitle'])

    data_dict["label"].append(entry["label"])

# Build the table against the declared schema instead of relying on type
# inference, so a column cannot silently get a type the writer rejects.
new_data = pa.Table.from_pydict(data_dict, schema=my_schema)

# NOTE: ParquetWriter creates (or overwrites) 'relevant.parquet'; it does not
# append to an existing file. The with-block closes the writer, finalizing the
# file, even if write_table raises.
with pq.ParquetWriter('relevant.parquet', my_schema) as writer:
    writer.write_table(new_data)

In [29]:
# Open both Parquet files and report their row counts from the file metadata.
# If either file is missing, nothing is counted and an error line is printed.
try:
    irrelevant_parquet_file = pq.ParquetFile("irrelevant.parquet")
    relevant_parquet_file = pq.ParquetFile("relevant.parquet")
    print(f"The Irrelevant Parquet file has {irrelevant_parquet_file.metadata.num_rows} rows.")
    print(f"The Relevant Parquet file has {relevant_parquet_file.metadata.num_rows} rows.")
except FileNotFoundError as err:
    # Two different files are opened above; append the exception text so the
    # message shows which one could not be found.
    print(f"Error: The file was not found. ({err})")

The Irrelevant Parquet file has 2602 rows.
The Relevant Parquet file has 2012 rows.
