We will try to use a dataset of TikTok videos and classify each as 'safe', or 'misinformation'.

In [17]:
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/teo/Downloads/psyched-freedom-182221-658209fdfd55.json"

import time
from google.cloud import videointelligence_v1 as videointelligence



def transcribe_video(file_path):
    # Initialize the Video Intelligence API client
    client = videointelligence.VideoIntelligenceServiceClient()

    # Read the file and load it into the API
    with open(file_path, "rb") as video_file:
        input_content = video_file.read()

    # Configure the request for speech transcription
    features = [videointelligence.Feature.SPEECH_TRANSCRIPTION]
    config = videointelligence.SpeechTranscriptionConfig(
        language_code="en-US"  # Adjust language if needed
    )
    video_context = videointelligence.VideoContext(speech_transcription_config=config)

    # Start the operation
    operation = client.annotate_video(
        request={"features": features, "input_content": input_content, "video_context": video_context}
    )

    print(f"Started transcription for {file_path}... Waiting for completion.")

    # Poll the operation status until it completes
    while not operation.done():
        print("Waiting for transcription to finish...")
        time.sleep(10)  # Wait 10 seconds between checks

    # Process results after the operation is done
    if operation.result():
        result = operation.result()
        transcription_text = ""
        for annotation in result.annotation_results[0].speech_transcriptions:
            for alternative in annotation.alternatives:
                transcription_text += alternative.transcript + " "
        return transcription_text.strip()
    else:
        print(f"Transcription failed for {file_path}")
        return ""


In [25]:
import pandas as pd
import time

# Load your CSV file with columns "FileName" and "Label"
df = pd.read_csv("TikTokData.csv")  # Replace "your_file.csv" with the actual path to your CSV file

# Initialize an empty list to store the transcription results
transcriptions = []
ctr = 0

# Loop through each row and get transcriptions
for index, row in df.iterrows():
    ctr = ctr+1
    if ctr <= 9:
        continue
    
    video_path = row["FileName"]
    print(f"Processing transcription for {video_path}...")

    # Get the transcription (this will wait for each transcription to complete)
    transcription = transcribe_video(video_path)
    
    # Append the transcription to the list
    transcriptions.append(transcription)
    
    # Add a short sleep to avoid hitting API rate limits if needed (optional)
    time.sleep(1)

Processing transcription for file10-Fake.mp4...
Started transcription for file10-Fake.mp4... Waiting for completion.
Waiting for transcription to finish...
Waiting for transcription to finish...
Waiting for transcription to finish...
Waiting for transcription to finish...
Processing transcription for file11-Fake.mp4...
Started transcription for file11-Fake.mp4... Waiting for completion.
Waiting for transcription to finish...
Waiting for transcription to finish...
Waiting for transcription to finish...
Waiting for transcription to finish...
Processing transcription for file12-Real.mp4...
Started transcription for file12-Real.mp4... Waiting for completion.
Waiting for transcription to finish...
Waiting for transcription to finish...
Waiting for transcription to finish...
Waiting for transcription to finish...
Waiting for transcription to finish...
Processing transcription for file13-Safe.mp4...
Started transcription for file13-Safe.mp4... Waiting for completion.
Waiting for transcription

In [21]:
transcriptions

['news Disneyworld officially removed the drinking age in their Parks this means that anyone regardless of age can legally drink at Disney World this comes after a year-long battle with the state of Florida to get a resort exemption originally Disney was attempting to lower the drinking age to 18 however they ended up removing the drinking age all together Disney is doing this to make more money but it is proving to be very controversial the biggest concern is over people drunk driving but Disney has an ingenious solution to prevent that they will require guests under 21 to wear a special magic band that can only be removed by  nikesh members at the park exit however in order to get the Magic Band removed you must have a blood alcohol level below .02 if someone is unable to blow less than 0.02 Disney will take the guests home in a minivan however according to an official spokesperson guess who need the money man service should expect prices to be double or triple what an Uber or Lyft w

In [26]:
for _ in range(9):
    transcriptions.append(None)  # or np.nan for NaN values

# Add the "Transcription" column to the DataFrame
df["Transcription"] = transcriptions

# Save the updated DataFrame back to a CSV file
df.to_csv("NewTikTokData2.csv", index=False)  # Replace "updated_file.csv" with your desired output file name

# Display the updated DataFrame
print(df)


           FileName Label                                      Transcription
0    file1-Fake.mp4  fake  used to be a cast member at DisneyWorld doing ...
1    file2-Fake.mp4  fake  your eyes start shaking suddenly throw your ph...
2    file3-Fake.mp4  fake  elevator wire is broken and you are inside in ...
3    file4-Fake.mp4  fake  damn minute I'm finna bust your ass not sick u...
4    file5-Fake.mp4  fake  yeah we're to flip all you how I mean Hawaii R...
5    file6-Real.mp4  real  I told you to ground it what now give me your ...
6    file7-Fake.mp4  fake  this tiny bedroom barely fits a bed leaving no...
7    file8-Fake.mp4  fake                                               None
8    file9-Fake.mp4  fake                                               None
9   file10-Fake.mp4  fake                                               None
10  file11-Fake.mp4  fake                                               None
11  file12-Real.mp4  real                                               None