In [6]:
import re

def clean_file(input_filepath, output_filepath):
    """
    Clean a text file and save the result to another file.

    The input is read as UTF-8 and transformed in three steps:
      1. every literal 'json' is replaced with a comma;
      2. triple backticks, YouTube watch URLs, youtu.be short URLs and
         square brackets are removed;
      3. every run of whitespace (including newlines) is collapsed to a
         single space and the result is stripped.

    Errors are reported on stdout instead of being raised; the function
    always returns None.

    Args:
        input_filepath (str): The path to the input .txt file.
        output_filepath (str): The path where the cleaned content will be saved.
    """
    try:
        # Read inside a dedicated try block so that "not found" is reported
        # only for the input path. A FileNotFoundError raised while opening
        # the OUTPUT path (e.g. its directory does not exist) must not be
        # blamed on the input file; it falls through to the generic handler.
        try:
            with open(input_filepath, 'r', encoding='utf-8') as file:
                content = file.read()
        except FileNotFoundError:
            print(f"Error: The file '{input_filepath}' was not found.")
            return

        # Step 1: Replace the literal string 'json' with a comma
        content_with_comma = content.replace('json', ',')

        # Step 2: Define the regex pattern to find all other unwanted strings and URL types.
        # The URL alternatives stop at the first character outside [\w-], so
        # anything following the video id (e.g. '&t=10s') is left in place.
        patterns_to_remove = re.compile(
            r'```|'  # Matches the literal string ```
            r'https?://(?:www\.)?youtube\.com/watch\?v=[\w-]+|'  # Matches standard youtube watch URLs
            r'https?://youtu\.be/[\w-]+|'  # Matches shortened youtu.be URLs
            r'\[|\]'  # Matches literal [ and ]
        )

        # Use re.sub to replace all occurrences of the patterns with an empty string
        cleaned_content = patterns_to_remove.sub('', content_with_comma)

        # Collapse the whitespace left behind by the removals
        cleaned_content = re.sub(r'\s+', ' ', cleaned_content).strip()

        # Write the cleaned content to the output file
        with open(output_filepath, 'w', encoding='utf-8') as file:
            file.write(cleaned_content)

        print(f"Successfully cleaned '{input_filepath}' and saved to '{output_filepath}'.")

    except Exception as e:
        # Best-effort behaviour: report the problem and carry on.
        print(f"An unexpected error occurred: {e}")

# Example usage:
# Cleans 'data/rel_new.txt' and writes the result to 'data/rel.txt'.
# Both paths are relative to the current working directory.
if __name__ == "__main__":
    input_file, output_file = 'data/rel_new.txt', 'data/rel.txt'
    clean_file(input_file, output_file)


Successfully cleaned 'data/rel_new.txt' and saved to 'data/rel.txt'.


In [11]:
import json


In [20]:
import pyarrow as pa
import pyarrow.parquet as pq

# Schema of the dataset: four string-typed columns, in this order.
my_schema = pa.schema([
    pa.field(column_name, pa.string())
    for column_name in ("video_topic", "segment_description", "subtitle", "label")
])

# Build a zero-row table: one empty list per column declared in the schema.
empty_table = pa.Table.from_pydict(
    {column_name: [] for column_name in my_schema.names},
    schema=my_schema,
)

# Persist the empty table as 'relevant.parquet' in the working directory.
pq.write_table(empty_table, 'relevant.parquet')

In [21]:
import random

In [27]:
import pyarrow as pa
import pyarrow.parquet as pq
import json

# Schema of the Parquet file being written: four string-typed columns.
my_schema = pa.schema([
    pa.field("video_topic", pa.string()),
    pa.field("segment_description", pa.string()),
    pa.field("subtitle", pa.string()),
    pa.field("label", pa.string())
])

# Load the list of entries; each entry is indexed below by the keys
# "video_topic", "segment_description", "subtitle" and "label".
with open("data/rel.json", 'r', encoding='utf-8') as f:
    data = json.load(f)

# Dedicated, seeded generator so that re-running the cell picks the same
# placeholder for every silent segment (and the global `random` state is
# left untouched).
placeholder_rng = random.Random(42)
silent_placeholders = ["**No one is talking**", "**Silent**", "**No subtitle as silent**"]

data_dict = {
    "video_topic": [],
    "segment_description": [],
    "subtitle": [],
    "label": []
}
for entry in data:
    data_dict['video_topic'].append(entry["video_topic"])
    data_dict["segment_description"].append(entry["segment_description"])
    if entry['subtitle'] == "":
        # An empty subtitle is replaced by one of the placeholder markers.
        data_dict["subtitle"].append(placeholder_rng.choice(silent_placeholders))
    else:
        data_dict['subtitle'].append(entry['subtitle'])

    data_dict["label"].append(entry["label"])

# Build the table BEFORE opening the writer, and with the explicit schema, so
# a type problem in the data surfaces here rather than after the target file
# has already been opened for writing.
new_data = pa.Table.from_pydict(data_dict, schema=my_schema)

# NOTE: ParquetWriter creates (or overwrites) 'relevant.parquet'; it does not
# append to an existing file. The context manager closes the writer, which
# finalizes the file, even if write_table raises.
with pq.ParquetWriter('relevant.parquet', my_schema) as writer:
    writer.write_table(new_data)

In [29]:
# Report the row count stored in the metadata of each class-specific file.
try:
    # Both files are opened before anything is printed, so a missing file
    # produces only the error message.
    irrelevant_parquet_file = pq.ParquetFile("irrelevant.parquet")
    relevant_parquet_file = pq.ParquetFile("relevant.parquet")
except FileNotFoundError:
    print("Error: The file was not found.")
else:
    print(f"The Irrelevant Parquet file has {irrelevant_parquet_file.metadata.num_rows} rows.")
    print(f"The Relevant Parquet file has {relevant_parquet_file.metadata.num_rows} rows.")

The Irrelevant Parquet file has 2602 rows.
The Relevant Parquet file has 2012 rows.


### Preparing the Data for Training

In [9]:
# Shuffle each class separately, then hold out 10% of each class for validation and another 10% for test
import pandas as pd
import numpy as np
import os

def create_data_splits(relevant_file, irrelevant_file, random_seed=42,
                       val_fraction=0.1, test_fraction=0.1):
    """
    Create train/validation/test splits from relevant and irrelevant Parquet files.

    Each class is shuffled on its own, the first `val_fraction` of its rows go
    to validation, the next `test_fraction` go to test and the remainder goes
    to training. The per-class pieces are then concatenated and each combined
    set is shuffled again. Every hold-out piece contains at least one row
    slot per class (the size is `max(1, int(fraction * rows))`).

    Both files must contain a 'label' column; the printed summary counts the
    values 'Relevant' and 'Irrelevant' in it.

    Args:
        relevant_file (str): Path to relevant.parquet file
        irrelevant_file (str): Path to irrelevant.parquet file
        random_seed (int): Random seed for reproducibility
        val_fraction (float): Share of each class held out for validation.
        test_fraction (float): Share of each class held out for test.

    Returns:
        dict: The three combined DataFrames under the keys 'train',
        'validation' and 'test'.

    Raises:
        ValueError: If a fraction is not positive or the two fractions sum
            to more than 1.
    """
    if val_fraction <= 0 or test_fraction <= 0 or val_fraction + test_fraction > 1:
        raise ValueError(
            "val_fraction and test_fraction must be positive and sum to at most 1 "
            f"(got {val_fraction} and {test_fraction})"
        )

    # Kept from the original implementation: seeds NumPy's global generator.
    # The sampling below does not rely on it; every call passes random_state.
    np.random.seed(random_seed)

    # Load the Parquet files
    print("Loading Parquet files...")
    relevant_df = pd.read_parquet(relevant_file)
    irrelevant_df = pd.read_parquet(irrelevant_file)

    print(f"Relevant data: {len(relevant_df)} rows")
    print(f"Irrelevant data: {len(irrelevant_df)} rows")

    # Shuffle both datasets
    print("Shuffling datasets...")
    relevant_shuffled = relevant_df.sample(frac=1, random_state=random_seed).reset_index(drop=True)
    irrelevant_shuffled = irrelevant_df.sample(frac=1, random_state=random_seed).reset_index(drop=True)

    # Hold-out sizes per class (at least 1 row slot each)
    relevant_val_size = max(1, int(val_fraction * len(relevant_shuffled)))
    relevant_test_size = max(1, int(test_fraction * len(relevant_shuffled)))
    irrelevant_val_size = max(1, int(val_fraction * len(irrelevant_shuffled)))
    irrelevant_test_size = max(1, int(test_fraction * len(irrelevant_shuffled)))

    if (relevant_val_size, irrelevant_val_size) == (relevant_test_size, irrelevant_test_size):
        print(f"Sample size - Relevant: {relevant_val_size}, Irrelevant: {irrelevant_val_size}")
    else:
        print(f"Validation sample size - Relevant: {relevant_val_size}, Irrelevant: {irrelevant_val_size}")
        print(f"Test sample size - Relevant: {relevant_test_size}, Irrelevant: {irrelevant_test_size}")

    # Row offsets at which the training portion of each class starts
    relevant_train_start = relevant_val_size + relevant_test_size
    irrelevant_train_start = irrelevant_val_size + irrelevant_test_size

    # Validation set: the leading rows of each shuffled class
    validation_set = pd.concat(
        [relevant_shuffled.iloc[:relevant_val_size],
         irrelevant_shuffled.iloc[:irrelevant_val_size]],
        ignore_index=True,
    )

    # Test set: the rows immediately after the validation rows
    test_set = pd.concat(
        [relevant_shuffled.iloc[relevant_val_size:relevant_train_start],
         irrelevant_shuffled.iloc[irrelevant_val_size:irrelevant_train_start]],
        ignore_index=True,
    )

    # Training set: everything that is left
    training_set = pd.concat(
        [relevant_shuffled.iloc[relevant_train_start:],
         irrelevant_shuffled.iloc[irrelevant_train_start:]],
        ignore_index=True,
    )

    # Shuffle the final sets so the two classes are interleaved
    validation_set = validation_set.sample(frac=1, random_state=random_seed).reset_index(drop=True)
    test_set = test_set.sample(frac=1, random_state=random_seed).reset_index(drop=True)
    training_set = training_set.sample(frac=1, random_state=random_seed).reset_index(drop=True)

    # Print summary statistics
    print("\n=== Final Dataset Split Summary ===")
    for title, frame in (("Training set", training_set),
                         ("Validation set", validation_set),
                         ("Test set", test_set)):
        print(f"{title}: {len(frame)} rows")
        print(f"  - Relevant: {len(frame[frame['label'] == 'Relevant'])}")
        print(f"  - Irrelevant: {len(frame[frame['label'] == 'Irrelevant'])}")

    # Calculate percentages (skipped when there are no rows at all, which
    # would otherwise divide by zero)
    total_rows = len(training_set) + len(validation_set) + len(test_set)
    print(f"\nTotal rows: {total_rows}")
    if total_rows:
        print(f"Training: {len(training_set)/total_rows*100:.1f}%")
        print(f"Validation: {len(validation_set)/total_rows*100:.1f}%")
        print(f"Test: {len(test_set)/total_rows*100:.1f}%")

    return {
        'train': training_set,
        'validation': validation_set,
        'test': test_set,
    }

def save_splits(splits, output_dir='output_splits'):
    """
    Write the combined train/validation/test splits to Parquet files.

    The directory is created when it does not exist yet. Only the three
    combined sets are written (no per-class files), each without its index.

    Args:
        splits (dict): Mapping with the keys 'train', 'validation' and
            'test'; each value must provide a ``to_parquet`` method.
        output_dir (str): Directory to save the output files
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # (split key, destination path) pairs, written in this order.
    destinations = (
        ('train', f'{output_dir}/train_set.parquet'),
        ('validation', f'{output_dir}/validation_set.parquet'),
        ('test', f'{output_dir}/test_set.parquet'),
    )
    for split_key, parquet_path in destinations:
        splits[split_key].to_parquet(parquet_path, index=False)

    print(f"\nAll splits saved to '{output_dir}' directory")

# Main execution: build the splits from the two class files and save them.
if __name__ == "__main__":
    # File paths - adjust these as needed
    relevant_file = "data/relevant.parquet"
    irrelevant_file = "data/irrelevant.parquet"

    # Report every missing input (not only the first one) before stopping.
    missing_files = [
        candidate for candidate in (relevant_file, irrelevant_file)
        if not os.path.exists(candidate)
    ]
    for missing_file in missing_files:
        print(f"Error: {missing_file} not found!")
    if missing_files:
        # SystemExit keeps the exit status 1 without depending on the
        # `exit` helper, which is provided by the `site` module for
        # interactive use rather than being a regular builtin.
        raise SystemExit(1)

    try:
        # Create the splits
        splits = create_data_splits(relevant_file, irrelevant_file, random_seed=42)

        # Save the splits
        save_splits(splits, "data")

        # Only the three combined sets are written by save_splits, so only
        # those are listed here.
        print("\n=== Split Creation Complete ===")
        print("Files created:")
        print("- train_set.parquet (main training set)")
        print("- validation_set.parquet (main validation set)")
        print("- test_set.parquet (main test set)")

    except Exception as e:
        # Best-effort behaviour: report the failure instead of raising.
        print(f"Error occurred: {e}")

Loading Parquet files...
Relevant data: 2012 rows
Irrelevant data: 2602 rows
Shuffling datasets...
Sample size - Relevant: 201, Irrelevant: 260

=== Final Dataset Split Summary ===
Training set: 3692 rows
  - Relevant: 1610
  - Irrelevant: 2082
Validation set: 461 rows
  - Relevant: 201
  - Irrelevant: 260
Test set: 461 rows
  - Relevant: 201
  - Irrelevant: 260

Total rows: 4614
Training: 80.0%
Validation: 10.0%
Test: 10.0%

All splits saved to 'data' directory

=== Split Creation Complete ===
Files created:
- train_set.parquet (main training set)
- validation_set.parquet (main validation set)
- test_set.parquet (main test set)
- Plus individual class files for each split


In [14]:
# Find the longest subtitle in the irrelevant dataset and show every entry
# that has that length, followed by the length itself.
with open("data/irrelevant_full.json", 'r', encoding='utf-8') as f:
    data = json.load(f)

# Length of the 'subtitle' string of each entry, in file order.
subtitle_lengths = [len(p['subtitle']) for p in data]

# default=0 keeps the cell working when the file holds an empty list.
mx = max(subtitle_lengths, default=0)

# The maximum is computed rather than hard-coded, so the cell keeps showing
# the longest entries when the data file changes.
for i, ln in enumerate(subtitle_lengths):
    if ln == mx:
        print(i)
        print(data[i])
print(mx)


870
{'video_topic': 'An educational guide on how to prepare and deliver a webinar presentation for the ENG091 course.', 'segment_description': 'This extended segment is dedicated to administrative and logistical tasks. The instructor explains the rules for the upcoming group presentation, divides the students into groups, takes attendance, and addresses various student questions related to the assignment and group formations. This entire portion deviates from the academic subject of the lecture.', 'subtitle': "and then what are we going to do for a webinar presentation in English 091? First I'm going to divide you into four groups not five groups four groups. Then you will see there are four articles on buX and I will assign one article to each group. Then, every participant will present the different main ideas or components of the article. So let's say this is a long article, right? This is probably an article of let's say 10 or 15 paragraphs, alright because the length of the articl

In [18]:
# Join the two words with a newline so each one prints on its own line.
s = "\n".join(("nabil", "zero"))
print(s)

nabil
zero


In [None]:
Let's take attendance now. All assignments submitted? Confirmed? Yes. Shakib Shadman? Present. Munim? Absent. Mostofa? Absent. Istiak? Yes. Tanjila? Absent. Nibir, Haque, present. Abdullah Sany? Absent. Mahmuda Tabassum? Absent. Khalid? Tanvir, Farhin, Atik, present. Tasnim, present. Nuzhat? Sabuj, present. Hosamuddin? Absent. Riyad, Shoyeb? Yes. Mahmuda joined. Iffat? Tabassum Raya? Sifat? Present. Tahmida, present. Tabassum Bushra? Mehedi, present.
