We will use a dataset of TikTok videos and classify each video as either 'safe' or 'misinformation'.

In [10]:
import os

import nest_asyncio
from google.cloud import videointelligence_v1 as videointelligence

# Patch asyncio so it can be used from inside the notebook's running event loop.
nest_asyncio.apply()

def transcribe_video(video_path, language_code="en-US", timeout=300):
    """Transcribe the speech of a local video file with the Video Intelligence API.

    The whole file is read into memory and sent inline with the request; the
    call then blocks until the long-running annotation operation finishes.

    Args:
        video_path: Path of the video file on local disk.
        language_code: Language tag of the spoken audio. Defaults to "en-US".
        timeout: Maximum number of seconds to wait for the operation result.

    Returns:
        str: Every returned transcript, each terminated by a newline. An empty
        string when the API returns no annotation results or no speech.

    Raises:
        OSError: If ``video_path`` cannot be opened or read.
        Errors raised by the API client or by waiting on the operation
        propagate unchanged.
    """
    # Initialize the Video Intelligence API client
    client = videointelligence.VideoIntelligenceServiceClient()

    # Read the raw bytes of the video; they are uploaded as part of the request
    with open(video_path, "rb") as video_file:
        input_content = video_file.read()

    # Configure the request for speech transcription only
    features = [videointelligence.Feature.SPEECH_TRANSCRIPTION]
    config = videointelligence.SpeechTranscriptionConfig(language_code=language_code)
    video_context = videointelligence.VideoContext(speech_transcription_config=config)

    # Start the long-running annotation operation
    operation = client.annotate_video(
        request={"features": features, "input_content": input_content, "video_context": video_context}
    )

    print("Processing video for transcription...")
    result = operation.result(timeout=timeout)

    # Guard against a response without any annotation results instead of
    # failing with an IndexError on annotation_results[0].
    if not result.annotation_results:
        return ""

    # One inline video was sent, so only the first annotation result is read.
    # Every alternative of every transcription contributes one line.
    return "".join(
        alternative.transcript + "\n"
        for annotation in result.annotation_results[0].speech_transcriptions
        for alternative in annotation.alternatives
    )

In [15]:
import pandas as pd
import time

# Load the dataset. The loop below reads the "FileName" column of every row;
# the CSV is expected to carry a "Label" column as well.
df = pd.read_csv("TikTokData.csv")

# Seconds to wait after each video before sending the next request.
# NOTE(review): 100 s + 1 s are the pauses the original loop used, presumably
# to stay under the API quota — confirm and tune for your project.
PAUSE_AFTER_VIDEO_S = 100 + 1

# Transcripts in processing order (kept alongside the DataFrame column)
transcriptions = []

# Create the column up front and fill it row by row. Assigning the partially
# filled `transcriptions` list to the column inside the loop raises a
# length-mismatch ValueError as soon as the frame has more than one row.
df["Transcription"] = pd.NA

for index, video_path in df["FileName"].items():
    print(f"Processing transcription for {video_path}...")

    # Blocks until the transcription of this video is complete
    transcription = transcribe_video(video_path)
    transcriptions.append(transcription)
    df.at[index, "Transcription"] = transcription

    # Checkpoint before pausing, so interrupting the kernel during the wait
    # does not lose the transcription that has just finished.
    df.to_csv("NewTikTokData.csv", index=False)

    time.sleep(PAUSE_AFTER_VIDEO_S)

# Preview the updated DataFrame (the full table is in NewTikTokData.csv)
df.head()


Processing transcription for file1-Fake.mp4...


KeyboardInterrupt: 