# Raw Data Wrangling
- unpacking json files
- removing redundant columns
- removing new line characters from the transcript to allow exporting as a csv file

In [1]:
import json
import os
import pandas as pd
import pymongo
import numpy as np

### Downloading data from MongoDB
As an alternative, you can load the JSON files individually.

In [2]:
import os
from urllib.parse import quote_plus

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# Credentials are taken from the environment when MONGODB_USER /
# MONGODB_PASSWORD are set, so other accounts never have to be written into
# the notebook. The defaults reproduce the previously hard-coded values.
user = os.environ.get('MONGODB_USER', 'read-user')
password = os.environ.get('MONGODB_PASSWORD', 'read-user')

# User name and password must be percent-escaped inside a MongoDB URI;
# quote_plus leaves the default values unchanged.
uri = f"mongodb+srv://{quote_plus(user)}:{quote_plus(password)}@nlp-recommend.nylbml2.mongodb.net/"

client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection.
# A failure is printed rather than raised, so later cells still run and will
# fail on their own if the connection is unusable.
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [3]:
# Select the "nlp-recommend" database and its "TED-talks" collection.
mydb = client["nlp-recommend"]
collection = mydb["TED-talks"]
# Count the documents in the collection (the empty filter matches all of them).
collection.count_documents({})

6099

In [4]:
# Fetch every document of the collection into memory and build one DataFrame
# row per document.
rawdb=pd.DataFrame(list(collection.find()))

In [5]:
# Preview the first rows; `videoData` and `transcriptData` still hold nested dicts.
rawdb.head()

Unnamed: 0,_id,preview,shortenedUrl,action,videoData,transcriptData,commentsEnabled,commentsLoggedInOnly,talk_id
0,652af294d95841780141bab7,False,https://go.ted.com/6Ryx,,"{'__typename': 'Video', 'id': '2147', 'slug': ...","{'translation': {'__typename': 'Translation', ...",False,False,2147
1,652af54dd95841780141bab8,False,https://go.ted.com/6sZX,,"{'__typename': 'Video', 'id': '2683', 'slug': ...","{'translation': {'__typename': 'Translation', ...",False,False,2683
2,652af54dd95841780141bab9,False,https://go.ted.com/6yKv,,"{'__typename': 'Video', 'id': '91525', 'slug':...","{'translation': {'__typename': 'Translation', ...",False,False,91525
3,652af54dd95841780141baba,False,https://go.ted.com/6RgH,,"{'__typename': 'Video', 'id': '101504', 'slug'...","{'translation': {'__typename': 'Translation', ...",True,True,101504
4,652af54dd95841780141babb,False,https://go.ted.com/6JLM,,"{'__typename': 'Video', 'id': '14610', 'slug':...","{'translation': {'__typename': 'Translation', ...",False,False,14610


### Data transforming: from json to dataframe columns

In [42]:
#Function to flatten jsons
def json_flatten(data,col):
    """Expand one column of nested records into top-level columns.

    Each cell of ``data[col]`` (a dict or a list) is spread over new columns
    named ``"<col>-<key>"`` (dict keys) or ``"<col>-<position>"`` (list
    positions). The original ``col`` is removed and the new columns are
    appended on the right.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the nested column. It is not modified.
    col : str
        Name of the column to expand.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``data``.
    """
    # Reuse the caller's index. With a fresh 0..n-1 index the column-wise
    # concat below would align on labels and produce extra, half-empty rows
    # whenever `data` had been filtered, sorted or otherwise re-indexed.
    expanded = pd.DataFrame(data[col].tolist(), index=data.index)
    expanded.columns = [f"{col}-{key}" for key in expanded.columns]
    return pd.concat([data.drop(col, axis=1), expanded], axis=1)

In [43]:
# Flatten the nested columns level by level. The order matters: a parent
# column has to be expanded before the "<parent>-<key>" columns it produces
# can be expanded in turn.
nested_columns = [
    'videoData',
    'videoData-type',
    'transcriptData',
    'transcriptData-video',
    'transcriptData-video-talkExtras',
    'videoData-topics',
    'videoData-talkExtras',
    'videoData-relatedVideos',
    'videoData-primaryImageSet',
    'videoData-customContentDetails',
    'videoData-speakers',
    'videoData-topics-nodes',
    'videoData-speakers-nodes',
    'videoData-topics-nodes-0',
]

# Start from a copy so `rawdb` keeps the documents exactly as downloaded.
transformed = rawdb.copy()
for nested in nested_columns:
    transformed = json_flatten(transformed, nested)

In [44]:
# concatenating the transcript
def _transcript_parts(translation):
    """Return ``(language, text)`` for one ``transcriptData-translation`` cell.

    A cell that is not a dict (``None``, or NaN where the key was absent)
    yields two empty strings. Otherwise the language is read from
    ``translation['language']['englishName']`` and the text is built from
    every cue of every paragraph, each cue prefixed with one space.
    """
    if not isinstance(translation, dict):
        return "", ""
    language = translation['language']['englishName']
    # A single join replaces repeated `+=`. The space in front of every cue
    # (including the first one) is kept so the resulting text is unchanged.
    text = "".join(
        " " + cue["text"]
        for paragraph in translation['paragraphs']
        for cue in paragraph["cues"]
    )
    return language, text


# Compute both values once per row, then attach the two columns in one step
# instead of writing cell by cell inside `iterrows`.
transcript_parts = [_transcript_parts(cell) for cell in transformed['transcriptData-translation']]
transformed = transformed.assign(
    transcript=[text for _, text in transcript_parts],
    transcript_language=[language for language, _ in transcript_parts],
)

# Spot-check the first transcript.
transformed['transcript'].loc[0]


' (Music) (Applause)'

In [45]:
# Remove columns that are not carried into the flattened dataset.
unneeded_columns = [
    'transcriptData-translation',
    'videoData-commentsEnabled',
    'videoData-commentsLoggedInOnly',
    'videoData-id',
    'transcriptData-video-id',
]
transformed = transformed.drop(columns=unneeded_columns)

# fill in missing jsons and flatting the remaining columns
def _fill_and_flatten(frame, prefix, positions, placeholder):
    """Flatten the numbered columns ``"<prefix>-<position>"`` one after another.

    Missing cells (``None``/NaN) are first replaced by ``placeholder`` so that
    every cell handed to ``json_flatten`` is a dict with the same keys.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the numbered columns. It is not modified.
    prefix : str
        Common column-name prefix, e.g. ``'videoData-topics-nodes'``.
    positions : iterable of int
        Numeric suffixes to process, in order.
    placeholder : dict
        Record substituted for a missing cell.

    Returns
    -------
    pandas.DataFrame
        New frame with every listed column replaced by its flattened columns.
    """
    for position in positions:
        column = f"{prefix}-{position}"
        filled = frame[column].apply(lambda cell: placeholder if pd.isna(cell) else cell)
        frame = json_flatten(frame.assign(**{column: filled}), column)
    return frame


TOPIC_PLACEHOLDER = {'__typename': None, 'id': None, 'name': None, 'slug': None}
RELATED_VIDEO_PLACEHOLDER = {'__typename': None, 'slug': None, 'id': None}
IMAGE_PLACEHOLDER = {'__typename': None, 'url': None, 'aspectRatioName': None}
# NOTE(review): 'isLive' is filled with False rather than None, so the isLive
# columns of absent speakers contain no missing values — confirm this is intended.
SPEAKER_PLACEHOLDER = {'__typename': None, 'photoUrl': None, 'firstname': None, 'middlename': None,
                       'lastname': None, 'description': None, 'isLive': False, 'title': None,
                       'whatOthersSay': None, 'whoTheyAre': None, 'whyListen': None, 'slug': None}

# The position ranges are fixed to the column counts of the current data.
# Topics start at 1 because 'videoData-topics-nodes-0' was already flattened
# together with the first batch of columns.
transformed = _fill_and_flatten(transformed, 'videoData-topics-nodes', range(1, 31), TOPIC_PLACEHOLDER)
transformed = _fill_and_flatten(transformed, 'videoData-relatedVideos', range(0, 6), RELATED_VIDEO_PLACEHOLDER)
transformed = _fill_and_flatten(transformed, 'videoData-primaryImageSet', range(0, 5), IMAGE_PLACEHOLDER)
transformed = _fill_and_flatten(transformed, 'videoData-speakers-nodes', range(0, 16), SPEAKER_PLACEHOLDER)

#extracting the external information from videoData-playerData
# Record used when a talk's player data has no "external" entry.
EXTERNAL_PLACEHOLDER = {'service': None, 'code': None, 'duration': None, 'start_time': None}
col='external'

# 'videoData-playerData' holds a JSON string per talk. Parse each one and
# collect its "external" record, or the placeholder when it is absent.
external_records = []
for raw_player_data in transformed['videoData-playerData']:
    player_data = json.loads(raw_player_data)
    external_records.append(player_data[col] if col in player_data else EXTERNAL_PLACEHOLDER)

# Attach all records in a single assignment. This avoids creating the column
# from an empty dict and then filling it cell by cell inside `iterrows`.
transformed = transformed.assign(**{col: external_records})
transformed=json_flatten(transformed,col)

# The raw JSON string is no longer needed.
transformed=transformed.drop(['videoData-playerData'], axis=1)



# `transformed.loc[0]` prints one entry per column, so the row limit has to
# cover the column count. The previous limit (row count + 1) only did so
# because this dataset has more rows than columns. Taking the larger dimension
# keeps the same limit for the current data and still works for a frame with
# fewer rows than columns.
pd.set_option('display.max_rows', max(transformed.shape)+1)
transformed.loc[0]

_id                                                                     652af294d95841780141bab7
preview                                                                                    False
shortenedUrl                                                             https://go.ted.com/6Ryx
action                                                                                      None
commentsEnabled                                                                            False
commentsLoggedInOnly                                                                       False
talk_id                                                                                     2147
videoData-__typename                                                                       Video
videoData-slug                                 aakash_odedra_a_dance_in_a_hurricane_of_paper_...
videoData-title                                  A dance in a hurricane of paper, wind and light
videoData-socialTitle         

### Data cleaning

In [46]:
# Current shape as (rows, columns), taken before the cleaning steps below.
transformed.shape

(6099, 395)

In [47]:
# Work on a copy so the cleaning steps below leave `transformed` untouched.
final_df=transformed.copy()


In [48]:
# Replace line-break characters with a space in the listed free-text columns.
chars = ['\n', '\r']
cols = [
    'videoData-speakers-nodes-0-whoTheyAre',
    'videoData-description',
    'videoData-socialDescription',
    'transcript',
    'videoData-speakers-nodes-0-whatOthersSay',
    'videoData-speakers-nodes-1-whoTheyAre',
]
# One alternation pattern handles both characters in a single pass. Each
# matched character becomes one space, exactly as with one pass per character.
line_break_pattern = '|'.join(chars)
for text_col in cols:
    final_df[text_col] = final_df[text_col].replace(line_break_pattern, ' ', regex=True)

In [49]:
#dropping columns with more than NA_THRESHOLD missing values
NA_THRESHOLD = 5000

# The row total shown in the report comes from the frame itself, so the
# message stays correct if the collection grows or shrinks.
total_rows = len(final_df)

# Count missing values for all columns at once and keep only the sparse ones
# (column order is preserved).
na_counts = final_df.isna().sum()
sparse_counts = na_counts[na_counts > NA_THRESHOLD]

for col, nas in sparse_counts.items():
    print(f'{col} has {nas} NAs/{total_rows}')

# Drop every sparse column in a single call.
final_df = final_df.drop(columns=sparse_counts.index)
        
     

action has 6099 NAs/6099
videoData-partnerName has 5527 NAs/6099
videoData-customContentDetails-partnerName has 6087 NAs/6099
videoData-topics-nodes-10-__typename has 5307 NAs/6099
videoData-topics-nodes-10-id has 5307 NAs/6099
videoData-topics-nodes-10-name has 5307 NAs/6099
videoData-topics-nodes-10-slug has 5307 NAs/6099
videoData-topics-nodes-11-__typename has 5537 NAs/6099
videoData-topics-nodes-11-id has 5537 NAs/6099
videoData-topics-nodes-11-name has 5537 NAs/6099
videoData-topics-nodes-11-slug has 5537 NAs/6099
videoData-topics-nodes-12-__typename has 5690 NAs/6099
videoData-topics-nodes-12-id has 5690 NAs/6099
videoData-topics-nodes-12-name has 5690 NAs/6099
videoData-topics-nodes-12-slug has 5690 NAs/6099
videoData-topics-nodes-13-__typename has 5805 NAs/6099
videoData-topics-nodes-13-id has 5805 NAs/6099
videoData-topics-nodes-13-name has 5805 NAs/6099
videoData-topics-nodes-13-slug has 5805 NAs/6099
videoData-topics-nodes-14-__typename has 5887 NAs/6099
videoData-topics-no

videoData-speakers-nodes-7-whoTheyAre has 6094 NAs/6099
videoData-speakers-nodes-7-whyListen has 6094 NAs/6099
videoData-speakers-nodes-7-slug has 6094 NAs/6099
videoData-speakers-nodes-8-__typename has 6094 NAs/6099
videoData-speakers-nodes-8-photoUrl has 6094 NAs/6099
videoData-speakers-nodes-8-firstname has 6094 NAs/6099
videoData-speakers-nodes-8-middlename has 6094 NAs/6099
videoData-speakers-nodes-8-lastname has 6094 NAs/6099
videoData-speakers-nodes-8-description has 6094 NAs/6099
videoData-speakers-nodes-8-title has 6094 NAs/6099
videoData-speakers-nodes-8-whatOthersSay has 6094 NAs/6099
videoData-speakers-nodes-8-whoTheyAre has 6094 NAs/6099
videoData-speakers-nodes-8-whyListen has 6094 NAs/6099
videoData-speakers-nodes-8-slug has 6094 NAs/6099
videoData-speakers-nodes-9-__typename has 6094 NAs/6099
videoData-speakers-nodes-9-photoUrl has 6094 NAs/6099
videoData-speakers-nodes-9-firstname has 6094 NAs/6099
videoData-speakers-nodes-9-middlename has 6094 NAs/6099
videoData-speak

In [50]:
# List every pair of columns whose contents are identical.
from itertools import combinations

identical_pairs = []
for left, right in combinations(final_df.columns, 2):
    # Series.equals treats missing values in the same positions as equal.
    if final_df[left].equals(final_df[right]):
        identical_pairs.append((left, right))

identical_pairs

[('preview', 'videoData-speakers-nodes-15-isLive'),
 ('videoData-__typename', 'transcriptData-video-__typename'),
 ('videoData-internalLanguageCode', 'videoData-language'),
 ('transcriptData-video-talkExtras-__typename',
  'videoData-talkExtras-__typename'),
 ('videoData-speakers-nodes-6-isLive', 'videoData-speakers-nodes-7-isLive'),
 ('videoData-speakers-nodes-6-isLive', 'videoData-speakers-nodes-8-isLive'),
 ('videoData-speakers-nodes-7-isLive', 'videoData-speakers-nodes-8-isLive'),
 ('videoData-speakers-nodes-10-isLive', 'videoData-speakers-nodes-12-isLive'),
 ('videoData-speakers-nodes-13-isLive', 'videoData-speakers-nodes-14-isLive')]

In [51]:
# First batch of redundant columns, removed in one call.
drop1 = [
    'videoData-__typename',
    'transcriptData-video-__typename',
    'transcriptData-video-talkExtras-__typename',
    'videoData-internalLanguageCode',
    'videoData-relatedVideos-5-__typename',
    'videoData-relatedVideos-4-__typename',
    'videoData-relatedVideos-3-__typename',
    'videoData-relatedVideos-2-__typename',
    'videoData-relatedVideos-1-__typename',
    'videoData-topics-nodes-1-__typename',
    'videoData-topics-nodes-2-__typename',
    'videoData-topics-nodes-3-__typename',
    'videoData-topics-nodes-4-__typename',
    'videoData-topics-nodes-5-__typename',
    'videoData-topics-nodes-6-__typename',
    'videoData-topics-nodes-7-__typename',
    'videoData-topics-nodes-8-__typename',
    'videoData-topics-nodes-9-__typename',
    'videoData-customContentDetails-__typename',
    'videoData-talkExtras-__typename',
]
final_df = final_df.drop(columns=drop1)

In [18]:
# Shape as (rows, columns) after the column drops above.
# NOTE(review): this cell's execution count (18) is lower than that of the
# preceding cells (42-51); re-run the notebook top to bottom to confirm the
# recorded output.
final_df.shape

(6099, 116)

In [19]:
# Show the first row of the cleaned frame, one line per remaining column.
final_df.loc[0]

_id                                                                     652af294d95841780141bab7
preview                                                                                    False
shortenedUrl                                                             https://go.ted.com/6Ryx
commentsEnabled                                                                            False
commentsLoggedInOnly                                                                       False
talk_id                                                                                     2147
videoData-slug                                 aakash_odedra_a_dance_in_a_hurricane_of_paper_...
videoData-title                                  A dance in a hurricane of paper, wind and light
videoData-socialTitle                            A dance in a hurricane of paper, wind and light
videoData-presenterDisplayName                                                     Aakash Odedra
videoData-recordedOn          

#### Remaining to unpack: 
- transcriptData-video-talkExtras-footnotes
- videoData-talkExtras-recommendations
- videoData-talkExtras-takeAction
- videoData-talkExtras-learnModules 

#### Steps to do:
- renaming columns,
- dropping redundant columns

In [20]:
# Export the cleaned frame to the current working directory. With pandas
# defaults the row index is written as an unnamed first column.
final_df.to_csv('dataset_v22-10-2023.csv')

In [22]:
# Spot-check the transcript stored under row label 271. The recorded output is
# Greek text, so transcripts are not all in English (see `transcript_language`).
final_df['transcript'].loc[271]

' Την παρουσίαση που έκανα εδώ πριν δύο χρόνια την έχω κάνει άλλες 2.000 φορές. Σήμερα το πρωί θα σας κάνω μια μικρή παρουσίαση που κάνω για πρώτη φορά, έτσι.. λοιπόν είναι  - δεν επιθυμώ να ανεβάσω τον πήχη αντιθέτως προσπαθώ να τον χαμηλώσω Διότι ετοίμασα αυτή την παρουσίαση, προκειμένου να αντιμετωπίσω  την πρόκληση αυτού του συνεδρίου Και η φανταστική παρουσίαση  της Κάρεν Άρμστρονγκ μου θύμησε ότι η θρησκεία όταν  την αντιλαμβανόμαστε σωστά δεν έχει να κάνει με την πίστη,  αλλά με τη συμπεριφορά. Πιθανώς θα πρέπει να λέμε το ίδιο  σχετικά με την έννοια της αισιοδοξίας Πώς τολμούμε να είμαστε αισιόδοξοι; Η αισιοδοξία μερικές φορές χαρακτηρίζεται  ως μία πίστη, μία διανοητική κατάσταση Όπως ο Μαχάτμα Γκάντι συνήθιζε να λέει «Πρέπει να γίνεις ο ίδιος η αλλαγή  που θέλεις να δεις στον κόσμο» Και το αποτέλεσμα για το οποίο θέλουμε να είμαστε αισιόδοξοι  δεν θα δημιουργηθεί από την πίστη και μόνο, εκτός  εάν αν βρεθούμε στο σημείο όπου η πίστη θα φέρει μία νέου είδους συμπεριφορά.  Αλλά