In [1]:
import warnings
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
import numpy as np
from scipy.spatial import distance
import glob
import os
import sys
import statistics
import spacy
import os
import re
from nltk.corpus import stopwords
import nltk
from tqdm import tqdm

tqdm.pandas()

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

file = sys.argv[1]
nlp = spacy.load('fr_core_news_sm')
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(file)[1:]

In [9]:
####### NLP functions #######

tokenize = lambda x: nlp(x)[0]

def transcribe(id):
    try:
        transcript = YouTubeTranscriptApi.get_transcript(id,languages=['fr'])
    except Exception as e:
        try:
            transcript = YouTubeTranscriptApi.list_transcripts(id).find_transcript(['en']).translate('fr').fetch()
        except Exception as e:
            return 'error'
    return ' '.join([t['text'] for t in transcript])

def toList(text):
    return [ token.lemma_ 
            for token in map(tokenize,text.split()) 
            if (not token.is_stop) and token.is_alpha]

def avg(all_groups):
    return [statistics.mean(i) for i in zip(*all_groups)]

def variance(all_groups):
    return [statistics.variance(i) for i in zip(*all_groups)]

def listToAvgAndVar(l):
    docVectors = [nlp(word).vector.tolist() for word in l]
    return pd.DataFrame([avg(docVectors)+variance(docVectors)])

def subQuotes(s):
    return re.sub("[\(\[].*?[\)\]]", "", s)

filter_ = lambda word_list : ' '.join([word for word in word_list.split() if word not in stopwords.words('english')])

In [4]:
df['vid_id'] = df.videos.apply(lambda x: x.split('watch?v=')[-1])

In [5]:
df['rawText'] = df.vid_id.progress_apply(transcribe)

100%|███████████████████████████████████████| 1456/1456 [13:33<00:00,  1.79it/s]


In [6]:
df.rawText = df.rawText.apply(subQuotes)

In [7]:
df = df[df.rawText != 'error']

In [10]:
df['clean'] = df.rawText.progress_apply(filter_)

100%|███████████████████████████████████████| 1239/1239 [06:41<00:00,  3.09it/s]


In [11]:
df['text_word_count'] = df['rawText'].apply(lambda x: len(str(x).split()))

In [16]:
df.to_csv('./scripted/'+file.split('/')[-1])