In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import string

True


In [None]:
# Extracting video transcripts
# The YouTube-8M dataset provides video IDs and features but does not include transcripts.
# To obtain transcripts, use the YouTube Data API or the youtube-transcript-api Python library.


#pip install youtube-transcript-api
from youtube_transcript_api import YouTubeTranscriptApi

# Replace with your video ID
# 'YOUR_VIDEO_ID' is a placeholder; presumably the lookup below fails until a
# real ID is filled in, in which case the except branch reports the error.
video_id = 'YOUR_VIDEO_ID'

# Fetch the transcript for one video and print every entry as "<start>: <text>".
# NOTE(review): get_transcript is called directly on the class here; newer
# youtube-transcript-api releases appear to have changed this interface —
# confirm against the installed version.
try:
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    for entry in transcript:
        # Each entry is indexed by its 'start' and 'text' keys.
        print(f"{entry['start']}: {entry['text']}")
except Exception as e:
    # Any failure in the fetch or in the print loop is reported, not re-raised,
    # so this cell never aborts the notebook run.
    print(f"An error occurred: {e}")



In [None]:
#CHECK

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, VideoUnavailable
import pandas as pd

def fetch_transcripts(video_ids):
    """
    Download the transcript text for each YouTube video ID.

    Every ID in ``video_ids`` gets an entry in the returned dictionary: the
    transcript entries' ``'text'`` fields joined with single spaces, or
    ``None`` when the transcript could not be fetched (disabled transcripts,
    unavailable video, or any other error). Failures are reported with
    ``print`` and never raised to the caller.
    """
    collected = {}
    for video_id in video_ids:
        # Start from "no transcript"; only a fully successful fetch replaces it.
        joined_text = None
        try:
            segments = YouTubeTranscriptApi.get_transcript(video_id)
            joined_text = " ".join(segment['text'] for segment in segments)
        except TranscriptsDisabled:
            print(f"Transcripts are disabled for video ID {video_id}")
        except VideoUnavailable:
            print(f"Video ID {video_id} is unavailable")
        except Exception as e:
            print(f"Error fetching transcript for video ID {video_id}: {e}")
        collected[video_id] = joined_text
    return collected


def preprocess_transcripts(transcripts):
    """
    Clean every available transcript with ``preprocess_text``.

    Truthy transcripts are run through ``preprocess_text`` and the resulting
    tokens are re-joined with single spaces; missing (``None``) or empty
    transcripts map to ``None``. Returns a new dictionary keyed by video ID.
    """
    return {
        vid: " ".join(preprocess_text(raw_text)) if raw_text else None
        for vid, raw_text in transcripts.items()
    }


def compute_tfidf_per_video(processed_transcripts):
    """
    Compute TF-IDF scores for processed transcripts.

    Parameters:
        processed_transcripts: dictionary mapping video IDs to a preprocessed
            transcript string, or ``None`` when no transcript is available.

    Returns:
        A dictionary with video IDs as keys and ``{term: score}`` dictionaries
        as values. Videos whose transcript is ``None`` are omitted. When no
        video has a transcript an empty dictionary is returned.
    """
    # Only videos with a transcript are vectorised, so matrix row k belongs to
    # the k-th *usable* video, not the k-th key of the input dictionary.
    usable_ids = [vid for vid, text in processed_transcripts.items() if text is not None]
    if not usable_ids:
        # Nothing to vectorise; fitting on an empty corpus would raise.
        return {}

    documents = [processed_transcripts[vid] for vid in usable_ids]

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()

    # Enumerate the filtered IDs so each video is paired with its own row.
    return {
        vid: dict(zip(feature_names, tfidf_matrix[row].toarray().flatten()))
        for row, vid in enumerate(usable_ids)
    }

def save_to_csv(video_ids, processed_transcripts, tfidf_scores, output_file):
    """
    Write one CSV row per video to ``output_file``.

    Columns: ``video_id``, ``cleaned_transcript`` (looked up in
    ``processed_transcripts``), ``tfidf_scores`` (``None`` when the video has
    no entry in ``tfidf_scores``) and ``generated_labels`` (always ``None``,
    reserved for later). Prints a confirmation once the file is written.
    """
    cleaned_column = []
    for vid in video_ids:
        cleaned_column.append(processed_transcripts[vid])

    score_column = [tfidf_scores.get(vid) for vid in video_ids]
    label_column = [None] * len(video_ids)  # Placeholder

    table = pd.DataFrame(
        {
            "video_id": video_ids,
            "cleaned_transcript": cleaned_column,
            "tfidf_scores": score_column,
            "generated_labels": label_column,
        }
    )
    table.to_csv(output_file, index=False)
    print(f"Data saved to {output_file}")


In [None]:
def preprocess_text(text):
    """
    Tokenize and normalise a piece of text.

    Steps: lowercase and tokenize with ``word_tokenize``; drop English stop
    words and tokens made up solely of ASCII punctuation; lemmatize each
    remaining token and then stem it.

    Parameters:
        text: the raw text to process.

    Returns:
        A list of normalised token strings (possibly empty).
    """
    # Tokenize text
    tokens = word_tokenize(text.lower())

    # Remove stop words and punctuation.
    # Punctuation is tested character by character: `token in string.punctuation`
    # is a substring test and would keep multi-character tokens such as "``",
    # "''" or "..." that the tokenizer can emit.
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)
    tokens = [
        word
        for word in tokens
        if word not in stop_words
        and not all(char in punctuation_chars for char in word)
    ]

    # Apply lemmatization first, then stemming, to each surviving token
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    tokens = [stemmer.stem(lemmatizer.lemmatize(word)) for word in tokens]

    return tokens

In [None]:
def compute_tfidf(transcripts):
    """
    Compute Term Frequency-Inverse Document Frequency (TF-IDF) scores for transcripts.

    TF-IDF weights each term by how important it is within one transcript
    relative to the whole collection: terms that are common across all
    transcripts are downweighted, so topic-specific words stand out.

    Parameters:
        transcripts: iterable of transcript strings (one document each).

    Returns:
        A list with one ``{term: score}`` dictionary per transcript, in input
        order. An empty input yields an empty list.
    """
    # A bare string is passed through untouched so the vectorizer can reject it
    # itself; any other iterable is materialised so emptiness can be checked.
    if not isinstance(transcripts, str):
        transcripts = list(transcripts)
        if not transcripts:
            # Fitting on an empty corpus would raise; there is nothing to score.
            return []

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(transcripts)  # TF-IDF matrix
    feature_names = vectorizer.get_feature_names_out()

    # Map terms to their scores for each transcript (one matrix row per transcript)
    return [dict(zip(feature_names, row.toarray().flatten())) for row in tfidf_matrix]

In [None]:
# Example data — swap in the real transcripts and labels for an actual run.
# The definition lives in this cell so the notebook works from a fresh kernel
# (previously it was commented out and `data` was undefined).
data = {
    "video_id": ["vid1", "vid2"],
    "raw_transcript": [
        "This is the first transcript for video one.",
        "This is another example transcript for video two."
    ],
    "annotated_labels": [["label1", "label2"], ["label3", "label4"]]
}

# Preprocess transcripts (each entry becomes a list of tokens)
data["cleaned_transcript"] = [preprocess_text(transcript) for transcript in data["raw_transcript"]]

# Join the tokens once: both the TF-IDF step and the CSV need the
# space-separated form of each cleaned transcript.
cleaned_texts = [" ".join(tokens) for tokens in data["cleaned_transcript"]]

# Compute TF-IDF for cleaned transcripts (one {term: score} dict per video)
tfidf_scores = compute_tfidf(cleaned_texts)

# Organize into a DataFrame
df = pd.DataFrame({
    "video_id": data["video_id"],
    "cleaned_transcript": cleaned_texts,
    "annotated_labels": data["annotated_labels"],
    "tfidf_scores": tfidf_scores,
    "generated_labels": [None] * len(data["video_id"])  # Placeholder for generated labels
})

# Save the table for further use
df.to_csv('preprocessed_data.csv', index=False)
print("Data saved successfully!")