In [1]:
import json
import os
import pandas as pd
import pymongo
import numpy as np

In [2]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# Credentials come from the environment so that real secrets never have to be
# written into the notebook; the fallbacks are the values that were previously
# hard-coded here, so the cell still runs unchanged when nothing is exported.
from urllib.parse import quote_plus

user = os.environ.get('MONGODB_USER', 'read-user')
password = os.environ.get('MONGODB_PASSWORD', 'read-user')

# PyMongo requires the username and password to be percent-escaped inside a URI
# (characters such as '@', ':' or '/' would otherwise corrupt it).
uri = f"mongodb+srv://{quote_plus(user)}:{quote_plus(password)}@nlp-recommend.nylbml2.mongodb.net/"

client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection.
# Best effort on purpose: a failed ping is reported, not raised, so the
# notebook can still be opened and read without network access.
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [3]:
# Select the "nlp-recommend" database on the connected client and, inside it,
# the "TED-talks" collection that the rest of the notebook reads from.
mydb = client["nlp-recommend"]
collection = mydb["TED-talks"]

In [4]:
collection.count_documents({})

6099

In [5]:
# Fetch every document of the collection (find() with no filter), materialize
# the cursor into a list and build one DataFrame row per document.
rawdb=pd.DataFrame(list(collection.find()))

In [6]:
rawdb.head()

Unnamed: 0,_id,preview,shortenedUrl,action,videoData,transcriptData,commentsEnabled,commentsLoggedInOnly,talk_id
0,652af294d95841780141bab7,False,https://go.ted.com/6Ryx,,"{'__typename': 'Video', 'id': '2147', 'slug': ...","{'translation': {'__typename': 'Translation', ...",False,False,2147
1,652af54dd95841780141bab8,False,https://go.ted.com/6sZX,,"{'__typename': 'Video', 'id': '2683', 'slug': ...","{'translation': {'__typename': 'Translation', ...",False,False,2683
2,652af54dd95841780141bab9,False,https://go.ted.com/6yKv,,"{'__typename': 'Video', 'id': '91525', 'slug':...","{'translation': {'__typename': 'Translation', ...",False,False,91525
3,652af54dd95841780141baba,False,https://go.ted.com/6RgH,,"{'__typename': 'Video', 'id': '101504', 'slug'...","{'translation': {'__typename': 'Translation', ...",True,True,101504
4,652af54dd95841780141babb,False,https://go.ted.com/6JLM,,"{'__typename': 'Video', 'id': '14610', 'slug':...","{'translation': {'__typename': 'Translation', ...",False,False,14610


### Data transformation: from JSON to DataFrame columns

In [7]:
def json_flatten(data, col):
    """Expand one column of nested records into separate, prefixed columns.

    Each cell of ``data[col]`` is expected to be a dict (or a list); the cells
    are turned into a frame whose columns are renamed to ``f"{col}-{key}"``
    and appended to ``data`` in place of the original column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the nested column. It is not modified.
    col : str
        Name of the column to expand.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``data``: ``col`` removed,
        the expanded columns added on the right.
    """
    # Build the expanded frame on the caller's index. With the default
    # RangeIndex, concat(axis=1) aligns on labels and would produce extra,
    # half-empty rows whenever `data` is not indexed 0..n-1 (e.g. after a
    # filter or a sort).
    expanded = pd.DataFrame(data[col].tolist(), index=data.index)
    expanded.columns = [f"{col}-{sub}" for sub in expanded.columns]
    return pd.concat([data.drop(col, axis=1), expanded], axis=1)

In [133]:
# Flatten the first batch of nested columns.
# The order matters: a child such as 'videoData-type' only exists once its
# parent ('videoData') has been expanded by json_flatten.
first_pass_columns = [
    'videoData',
    'videoData-type',
    'transcriptData',
    'transcriptData-video',
    'transcriptData-video-talkExtras',
    'videoData-topics',
    'videoData-talkExtras',
    'videoData-relatedVideos',
    'videoData-primaryImageSet',
    'videoData-customContentDetails',
    'videoData-speakers',
    'videoData-topics-nodes',
    'videoData-speakers-nodes',
    'videoData-topics-nodes-0',
]

# Work on a copy so that rawdb keeps the documents exactly as loaded.
transformed = rawdb.copy()
for col in first_pass_columns:
    transformed = json_flatten(transformed, col)

In [134]:
# Concatenating the transcript: one string per talk, made of the text of every
# cue of every paragraph, each preceded by a single space.
def _join_cues(translation):
    """Return the cue texts of one translation joined into a single string.

    A cell that is not a dict (None, or NaN if the key was absent) has no
    paragraphs to read and yields the empty string.
    """
    if not isinstance(translation, dict):
        return ""
    # join() builds the string in one pass instead of repeated `+=` copies.
    return "".join(
        " " + cue["text"]
        for paragraph in translation["paragraphs"]
        for cue in paragraph["cues"]
    )


# Column-wise apply replaces the row-by-row iterrows()/.at writes.
transformed = transformed.assign(
    transcript=transformed['transcriptData-translation'].apply(_join_cues)
)

transformed['transcript'].loc[0]


' (Music) (Applause)'

In [135]:
# dropping columns
transformed = transformed.drop(
    ['transcriptData-translation', 'videoData-commentsEnabled', 'videoData-commentsLoggedInOnly',
     'videoData-id', 'transcriptData-video-id'],
    axis=1,
)


def _flatten_numbered(frame, prefix, positions, placeholder):
    """Flatten the columns ``prefix + str(i)`` for every i in ``positions``.

    Cells holding no record (None/NaN) are first replaced by ``placeholder``,
    a dict carrying the expected keys, so that json_flatten receives a dict
    for every row. Returns the frame with all those columns expanded.
    """
    for i in positions:
        column = prefix + str(i)
        frame[column] = frame[column].apply(
            lambda cell: placeholder if pd.isna(cell) else cell
        )
        frame = json_flatten(frame, column)
    return frame


def _external_of(player_json):
    """Return the 'external' entry of a playerData JSON string, or an all-None record."""
    parsed = json.loads(player_json)
    if 'external' in parsed:
        return parsed['external']
    return {'service': None, 'code': None, 'duration': None, 'start_time': None}


# fill in missing jsons and flatten the remaining numbered columns
transformed = _flatten_numbered(
    transformed, 'videoData-topics-nodes-', range(1, 31),
    {'__typename': None, 'id': None, 'name': None, 'slug': None},
)
transformed = _flatten_numbered(
    transformed, 'videoData-relatedVideos-', range(0, 6),
    {'__typename': None, 'slug': None, 'id': None},
)
transformed = _flatten_numbered(
    transformed, 'videoData-primaryImageSet-', range(0, 5),
    {'__typename': None, 'url': None, 'aspectRatioName': None},
)
transformed = _flatten_numbered(
    transformed, 'videoData-speakers-nodes-', range(0, 16),
    {'__typename': None, 'photoUrl': None, 'firstname': None, 'middlename': None,
     'lastname': None, 'description': None, 'isLive': False, 'title': None,
     'whatOthersSay': None, 'whoTheyAre': None, 'whyListen': None, 'slug': None},
)

# extracting the external information from videoData-playerData:
# parse each JSON string once and build the whole column in one go, instead of
# creating an empty column and writing dicts into it cell by cell.
transformed = transformed.assign(
    external=[_external_of(player) for player in transformed['videoData-playerData']]
)
transformed = json_flatten(transformed, 'external')

transformed = transformed.drop(['videoData-playerData'], axis=1)

# Raise the display row limit so the single-row Series below (one entry per
# column of the frame) is printed without truncation.
pd.set_option('display.max_rows', transformed.shape[0]+1)
transformed.loc[0]

_id                                                                     652af294d95841780141bab7
preview                                                                                    False
shortenedUrl                                                             https://go.ted.com/6Ryx
action                                                                                      None
commentsEnabled                                                                            False
commentsLoggedInOnly                                                                       False
talk_id                                                                                     2147
videoData-__typename                                                                       Video
videoData-slug                                 aakash_odedra_a_dance_in_a_hurricane_of_paper_...
videoData-title                                  A dance in a hurricane of paper, wind and light
videoData-socialTitle         

### Data cleaning

In [65]:
transformed.shape

(6099, 394)

In [178]:
final_df=transformed.copy()

In [179]:
# Replace line-break characters with a single space in the free-text columns.
chars = ['\n', '\r']
cols = [
    'videoData-speakers-nodes-0-whoTheyAre',
    'videoData-description',
    'videoData-socialDescription',
    'transcript',
    'videoData-speakers-nodes-0-whatOthersSay',
    'videoData-speakers-nodes-1-whoTheyAre',
]

# One regex alternation covers every character of `chars` in a single pass.
line_breaks = '|'.join(chars)
for col in cols:
    final_df[col] = final_df[col].replace(line_breaks, ' ', regex=True)

In [180]:
# dropping columns with more than NA_LIMIT missing values
NA_LIMIT = 5000

# Count the missing values of every column once, keep the offenders.
na_counts = final_df.isna().sum()
sparse_cols = na_counts[na_counts > NA_LIMIT]

# Report against the actual row count rather than a hard-coded total.
for col, nas in sparse_cols.items():
    print(f'{col} has {nas} NAs/{len(final_df)}')

# A single drop avoids copying the frame once per removed column.
final_df = final_df.drop(columns=sparse_cols.index)
    

action has 6099 NAs/6099
videoData-partnerName has 5527 NAs/6099
videoData-customContentDetails-partnerName has 6087 NAs/6099
videoData-topics-nodes-10-__typename has 5307 NAs/6099
videoData-topics-nodes-10-id has 5307 NAs/6099
videoData-topics-nodes-10-name has 5307 NAs/6099
videoData-topics-nodes-10-slug has 5307 NAs/6099
videoData-topics-nodes-11-__typename has 5537 NAs/6099
videoData-topics-nodes-11-id has 5537 NAs/6099
videoData-topics-nodes-11-name has 5537 NAs/6099
videoData-topics-nodes-11-slug has 5537 NAs/6099
videoData-topics-nodes-12-__typename has 5690 NAs/6099
videoData-topics-nodes-12-id has 5690 NAs/6099
videoData-topics-nodes-12-name has 5690 NAs/6099
videoData-topics-nodes-12-slug has 5690 NAs/6099
videoData-topics-nodes-13-__typename has 5805 NAs/6099
videoData-topics-nodes-13-id has 5805 NAs/6099
videoData-topics-nodes-13-name has 5805 NAs/6099
videoData-topics-nodes-13-slug has 5805 NAs/6099
videoData-topics-nodes-14-__typename has 5887 NAs/6099
videoData-topics-no

videoData-speakers-nodes-7-description has 6094 NAs/6099
videoData-speakers-nodes-7-title has 6094 NAs/6099
videoData-speakers-nodes-7-whatOthersSay has 6094 NAs/6099
videoData-speakers-nodes-7-whoTheyAre has 6094 NAs/6099
videoData-speakers-nodes-7-whyListen has 6094 NAs/6099
videoData-speakers-nodes-7-slug has 6094 NAs/6099
videoData-speakers-nodes-8-__typename has 6094 NAs/6099
videoData-speakers-nodes-8-photoUrl has 6094 NAs/6099
videoData-speakers-nodes-8-firstname has 6094 NAs/6099
videoData-speakers-nodes-8-middlename has 6094 NAs/6099
videoData-speakers-nodes-8-lastname has 6094 NAs/6099
videoData-speakers-nodes-8-description has 6094 NAs/6099
videoData-speakers-nodes-8-title has 6094 NAs/6099
videoData-speakers-nodes-8-whatOthersSay has 6094 NAs/6099
videoData-speakers-nodes-8-whoTheyAre has 6094 NAs/6099
videoData-speakers-nodes-8-whyListen has 6094 NAs/6099
videoData-speakers-nodes-8-slug has 6094 NAs/6099
videoData-speakers-nodes-9-__typename has 6094 NAs/6099
videoData-spe

In [181]:
#finding duplicated columns
from itertools import combinations

[(i, j) for i,j in combinations(final_df, 2) if final_df[i].equals(final_df[j])]

[('preview', 'videoData-speakers-nodes-15-isLive'),
 ('videoData-__typename', 'transcriptData-video-__typename'),
 ('videoData-internalLanguageCode', 'videoData-language'),
 ('transcriptData-video-talkExtras-__typename',
  'videoData-talkExtras-__typename'),
 ('videoData-speakers-nodes-6-isLive', 'videoData-speakers-nodes-7-isLive'),
 ('videoData-speakers-nodes-6-isLive', 'videoData-speakers-nodes-8-isLive'),
 ('videoData-speakers-nodes-7-isLive', 'videoData-speakers-nodes-8-isLive'),
 ('videoData-speakers-nodes-10-isLive', 'videoData-speakers-nodes-12-isLive'),
 ('videoData-speakers-nodes-13-isLive', 'videoData-speakers-nodes-14-isLive')]

In [182]:
# First list of redundant columns, removed from final_df in one call.
drop1 = [
    'videoData-__typename',
    'transcriptData-video-__typename',
    'transcriptData-video-talkExtras-__typename',
    'videoData-internalLanguageCode',
    'videoData-relatedVideos-5-__typename',
    'videoData-relatedVideos-4-__typename',
    'videoData-relatedVideos-3-__typename',
    'videoData-relatedVideos-2-__typename',
    'videoData-relatedVideos-1-__typename',
    'videoData-topics-nodes-1-__typename',
    'videoData-topics-nodes-2-__typename',
    'videoData-topics-nodes-3-__typename',
    'videoData-topics-nodes-4-__typename',
    'videoData-topics-nodes-5-__typename',
    'videoData-topics-nodes-6-__typename',
    'videoData-topics-nodes-7-__typename',
    'videoData-topics-nodes-8-__typename',
    'videoData-topics-nodes-9-__typename',
    'videoData-customContentDetails-__typename',
    'videoData-talkExtras-__typename',
]
final_df = final_df.drop(columns=drop1)

In [183]:
final_df.shape


(6099, 116)

In [184]:
final_df.loc[0]

_id                                                                     652af294d95841780141bab7
preview                                                                                    False
shortenedUrl                                                             https://go.ted.com/6Ryx
commentsEnabled                                                                            False
commentsLoggedInOnly                                                                       False
talk_id                                                                                     2147
videoData-slug                                 aakash_odedra_a_dance_in_a_hurricane_of_paper_...
videoData-title                                  A dance in a hurricane of paper, wind and light
videoData-socialTitle                            A dance in a hurricane of paper, wind and light
videoData-presenterDisplayName                                                     Aakash Odedra
videoData-recordedOn          

In [185]:
# Write the cleaned frame to 'data.csv' in the current working directory.
# to_csv keeps the index by default, so it becomes the first (unnamed) column.
final_df.to_csv('data.csv')

In [101]:

# Distinct values of the TalkExtras typename column. Read from `transformed`:
# this column is listed in drop1, so it no longer exists in final_df when the
# notebook is run top to bottom.
transformed['videoData-talkExtras-__typename'].unique()

In [105]:
final_df.loc[2]

_id                                                                     652af54dd95841780141bab9
preview                                                                                    False
shortenedUrl                                                             https://go.ted.com/6yKv
commentsEnabled                                                                            False
commentsLoggedInOnly                                                                       False
talk_id                                                                                    91525
videoData-slug                                 aarathi_krishnan_5_ethical_principles_for_digi...
videoData-title                                5 ethical principles for digitizing humanitari...
videoData-socialTitle                                                                           
videoData-presenterDisplayName                                                  Aarathi Krishnan
videoData-recordedOn          

In [106]:
# Distinct values of the video-type typename column (the empty column name
# used before raised a KeyError).
final_df['videoData-type-__typename'].unique()

array(['TypeOfVideo'], dtype=object)

In [186]:
type(final_df['transcriptData-video-talkExtras-footnotes'][1][0])

dict

In [81]:
final_df['videoData-talkExtras-recommendations'][2]

[{'__typename': 'Recommendation',
  'blurb': 'More resources curated by Aarathi Krishnan',
  'recLists': [{'__typename': 'RecommendationList',
    'title': '',
    'description': '',
    'recItems': [{'__typename': 'RecommendationItem',
      'blurb': '',
      'eyebrow': '',
      'headline': '"Humanitarian Digital Ethics: A Foresight and Decolonial Governance Approach"',
      'isPdf': False,
      'label': 'READ_ARTICLE',
      'linkUrl': 'https://carrcenter.hks.harvard.edu/publications/humanitarian-digital-ethics',
      'note': 'Aarathi Krishnan\r\nCarr Center Discussion Paper Series, 2022'},
     {'__typename': 'RecommendationItem',
      'blurb': '',
      'eyebrow': '',
      'headline': '"Unsettling the Coloniality of Foresight" chapter from *Sacred Civics*',
      'isPdf': False,
      'label': 'READ_BOOK',
      'linkUrl': 'https://www.taylorfrancis.com/chapters/oa-edit/10.4324/9781003199816-10/unsettling-coloniality-foresight-aarathi-krishnan?context=ubx&refId=ac9b2ab9-a7e8

In [78]:
# Side-by-side view of the two TalkExtras typename columns. Read from
# `transformed`: both columns are listed in drop1 and are therefore gone from
# final_df when the notebook is run top to bottom.
transformed[['transcriptData-video-talkExtras-__typename','videoData-talkExtras-__typename']]


transcriptData-video-talkExtras-footnotes                                                     []
videoData-topics-__typename                                                      TopicConnection
videoData-talkExtras-__typename                                                       TalkExtras
videoData-talkExtras-recommendations                                                          []
videoData-talkExtras-takeAction                                                               []
videoData-talkExtras-learnModules                                                             []

Unnamed: 0,transcriptData-video-talkExtras-__typename,videoData-talkExtras-__typename
0,TalkExtras,TalkExtras
1,TalkExtras,TalkExtras
2,TalkExtras,TalkExtras
3,TalkExtras,TalkExtras
4,TalkExtras,TalkExtras
5,TalkExtras,TalkExtras
6,TalkExtras,TalkExtras
7,TalkExtras,TalkExtras
8,TalkExtras,TalkExtras
9,TalkExtras,TalkExtras


In [None]:

# Number of distinct values in the video typename column. Read from
# `transformed`: 'videoData-__typename' is listed in drop1, so final_df no
# longer has it when the notebook is run top to bottom.
transformed[['videoData-__typename']].nunique()

In [None]:
# NOTE(review): unfinished draft — this cell does not parse and was never run.
#   * the `if len(...)>0` line has no trailing colon and no body;
#   * `enumerate(row[])` has an empty subscript, and the `for` has no body;
#   * the inner loop reuses the name `index`, which would shadow the row label
#     coming from iterrows().
# The lines from json.loads onward repeat the 'external' extraction, but they
# read 'videoData-playerData' (dropped from `transformed` before this point)
# and write into `transformed`, not into the `final_df` copy being iterated.
# Presumably the goal is to expand the footnotes column — TODO confirm the
# intent, then finish or delete this cell.
final_df=transformed.copy()

#final_df = final_df.assign(external={})
#col='external'
for index, row in final_df.iterrows():
    col='transcriptData-video-talkExtras-footnotes'
    if len(row['transcriptData-video-talkExtras-footnotes'])>0
    for index, el in enumerate(row[]):
    x=json.loads(row['videoData-playerData'])
    if col in x:
        transformed.at[index, col] = x[col]
        
    else:
        transformed.at[index, col] ={'service': None, 'code': None, 'duration': None, 'start_time': None}

In [None]:
# Prints every document returned by collection.find(), one per line
# (the whole collection, since no filter is given).
for x in collection.find(): 
    print(x)
# Directory the slug files are read from, relative to the working directory.
path_to_slug = 'slug/'


slugs_combined=[]

# NOTE(review): `slug_files` is not defined anywhere in the visible notebook —
# presumably a list of file names inside path_to_slug; confirm before running.
for sfile in slug_files:
    print(sfile)
    with open(os.path.join(path_to_slug, sfile)) as slug_file:
        # One entry per line of the file, line endings removed.
        loaded=slug_file.read().splitlines()
        print(f"{len(loaded)} slugs found")
        # append() stores each file's lines as one nested list, so
        # slugs_combined ends up as a list of lists (one per file).
        slugs_combined.append(loaded)


#pd.DataFrame.from_dict()

# NOTE(review): `data` is not defined anywhere in the visible notebook —
# presumably one parsed talk JSON with a 'pageProps' key; confirm before running.
# json_normalize flattens the nested videoData record into dotted column names.
data1=pd.json_normalize(data['pageProps']['videoData'])

# Replace the 'speakers.nodes' column by one column per list position.
pd.concat([data1.drop('speakers.nodes', axis=1), pd.DataFrame(data1['speakers.nodes'].tolist())], axis=1)


# Flatten the speakers.nodes value of the first row.
pd.json_normalize(data1['speakers.nodes'][0])


# Raw nested record, for visual inspection.
data['pageProps']['videoData']


pd.json_normalize(data1['customContentDetails.partnerName'])


# function to parse data and concat all cues to get the transcript

def getTextFromSlug(talkData):
    """Return the transcript of one talk as a single string.

    Walks ``talkData["pageProps"]["transcriptData"]["translation"]["paragraphs"]``
    and glues together the ``"text"`` of every cue, each preceded by one space
    (so a non-empty result starts with a space). No paragraphs or no cues
    gives the empty string.
    """
    paragraphs = talkData["pageProps"]["transcriptData"]["translation"]["paragraphs"]
    pieces = [
        " " + cue["text"]
        for paragraph in paragraphs
        for cue in paragraph["cues"]
    ]
    return "".join(pieces)


#data_init=pd.json_normalize(data_json['pageProps']['videoData'])