In [1]:
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoModel, BertTokenizerFast, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import time
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Load data
test_text = pd.read_csv('text_pred.tsv', sep='\t', names=['text'])['text']
#test_labels = pd.read_csv('test_labels.tsv', sep='\t', names=['label'])['label']

# test = pd.read_csv('./Data_L/test_l.csv')
# test_text = test['text']
# test_labels = test['label']

# Load BERT model and tokenizer
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
device = torch.device("cpu")

# Tokenize and encode sequences
#print('tokenizing...')
max_length = 128
tokens_test = tokenizer.batch_encode_plus(test_text.tolist(), max_length=max_length, pad_to_max_length=True, truncation=True)
#print('finished tokenizing')

# Convert lists to tensors
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
#test_y = torch.tensor(test_labels.tolist())

# Model definition
class BERT_Arch(nn.Module):
    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 3)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

model = BERT_Arch(bert)
model = model.to(device)

# Load best model and evaluate on test data
model.load_state_dict(torch.load('saved_weights.pt', map_location=torch.device('cpu')))
with torch.no_grad():
    preds = model(test_seq.to(device), test_mask.to(device))
    preds = preds.detach().cpu().numpy()

preds = np.argmax(preds, axis=1)
print(preds[0])

ModuleNotFoundError: No module named 'torch'

In [2]:
import pyktok as pyk
import moviepy.editor as mp
import speech_recognition as sr
from pydub import AudioSegment
import os
import fnmatch
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoModel, BertTokenizerFast
import warnings
warnings.filterwarnings("ignore")

def extract_audio_from_video(video_path, audio_path):
    video = mp.VideoFileClip(video_path)
    video.audio.write_audiofile(audio_path, codec='pcm_s16le')

def transcribe_audio(audio_path):
    recognizer = sr.Recognizer()
    
    # Load audio file with pydub
    audio = AudioSegment.from_file(audio_path)
    
    # Export audio to wav format
    wav_path = "temp.wav"
    audio.export(wav_path, format="wav")
    
    with sr.AudioFile(wav_path) as source:
        audio_data = recognizer.record(source)
        try:
            text = recognizer.recognize_google(audio_data, language='en-EN')
        except sr.UnknownValueError:
            print(0)
        except sr.RequestError as e:
            text = f"Could not request results from Google Speech Recognition service; {e}"

    os.remove(wav_path)  # Clean up temporary file
    return text

def transcribe_video(video_path):
    audio_path = "temp_audio.wav"
    
    try:
        extract_audio_from_video(video_path, audio_path)
        transcription = transcribe_audio(audio_path)
        return transcription
    finally:
        if os.path.exists(audio_path):
            os.remove(audio_path)  # Clean up temporary file

tt_link = 'https://www.tiktok.com/@textplot/video/7395979047253200160?_r=1'

pyk.specify_browser('chrome')
pyk.save_tiktok(tt_link, True)

for file in os.listdir('.'):
    if fnmatch.fnmatch(file, '*@*'):
        video_path = file

video_path = "test_2.mp4"
transcription = transcribe_video(video_path)
print("Transcription:", transcription)

# Load data
#test_text = pd.read_csv('text_pred.tsv', sep='\t', names=['text'])['text']
#test_labels = pd.read_csv('test_labels.tsv', sep='\t', names=['label'])['label']
test_text = [transcription]

# test = pd.read_csv('./Data_L/test_l.csv')
# test_text = test['text']
# test_labels = test['label']

# Load BERT model and tokenizer
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
device = torch.device("cpu")

# Tokenize and encode sequences
#print('tokenizing...')
max_length = 128
tokens_test = tokenizer.batch_encode_plus(test_text, max_length=max_length, pad_to_max_length=True, truncation=True)
#print('finished tokenizing')

# Convert lists to tensors
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
#test_y = torch.tensor(test_labels.tolist())

# Model definition
class BERT_Arch(nn.Module):
    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 3)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

model = BERT_Arch(bert)
model = model.to(device)

# Load best model and evaluate on test data
model.load_state_dict(torch.load('../saved_weights.pt', map_location=torch.device('cpu')))
with torch.no_grad():
    preds = model(test_seq.to(device), test_mask.to(device))
    preds = preds.detach().cpu().numpy()

preds = np.argmax(preds, axis=1)
print(preds[0])
print(transcription)

Saved video
 https://www.tiktok.com/@textplot/video/7395979047253200160?_r=1 
to
 c:\Users\pogiz\OneDrive\PC\Projects\2024_TikTokTechJam--blin--project\Prediction
MoviePy - Writing audio in temp_audio.wav


                                                                      

MoviePy - Done.
Transcription: would you like to show your faces first off you called me sir right now what's your name Erica oh wait one more time I didn't catch that see you soon those are kids you knew exactly what they were doing no they're not ignorant rocked in their hateful there's a difference hate is learnt they deserve for that video to be seen by the world I would imagine their parents are at the same mind if my kid was walking around harassing trans people on the street I want them to know it's not a light thing it's not like oh they're just being kids know an opportunity for them to learn sure




0
would you like to show your faces first off you called me sir right now what's your name Erica oh wait one more time I didn't catch that see you soon those are kids you knew exactly what they were doing no they're not ignorant rocked in their hateful there's a difference hate is learnt they deserve for that video to be seen by the world I would imagine their parents are at the same mind if my kid was walking around harassing trans people on the street I want them to know it's not a light thing it's not like oh they're just being kids know an opportunity for them to learn sure


In [13]:
from TikTokApi import TikTokApi
import requests

def download_tiktok_video(tiktok_url, output_path):
    api = TikTokApi()
    
    # Extract the video ID from the TikTok URL
    video_id = tiktok_url.split('/video/')[1].split('?')[0]
    
    # Get the video data
    video_data = api.video(id=video_id)
    
    # Get the download URL
    download_url = video_data['video']['downloadAddr']
    
    # Download the video
    response = requests.get(download_url)
    
    with open(output_path, 'wb') as f:
        f.write(response.content)

# Example usage
tiktok_url = 'https://www.tiktok.com/@howridiculous/video/7395408759939435794'
output_path = 'downloaded_tiktok_video.mp4'
download_tiktok_video(tiktok_url, output_path)

print(f"Video downloaded and saved to {output_path}")

TypeError: 'Video' object is not subscriptable