In [1]:
import pandas as pd

# Per-country cache of {category id (str): category title}. It is filled on first
# use so that row-wise callers (DataFrame.apply) parse each JSON file only once
# instead of once per row. Re-running this cell resets the cache.
_CATEGORY_TITLES = {}


def _load_category_titles(code):
    """Return the {id: title} mapping read from the category file of ``code``."""
    if code not in _CATEGORY_TITLES:
        categories_json = pd.read_json('Json_Categories/'+code+'_category_id.json')
        _CATEGORY_TITLES[code] = {
            element["id"]: element["snippet"]["title"]
            for element in categories_json["items"]
        }
    return _CATEGORY_TITLES[code]


def define_category(row, code):
    """Return the category title for one video row, or None if it is unknown.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must expose a ``"category_id"`` entry; only that entry is read.
    code : str
        Country code used to locate ``Json_Categories/<code>_category_id.json``.

    Returns
    -------
    The title stored under ``snippet.title`` for the matching category, or
    ``None`` when the id is missing (NaN) or not listed for this country.
    """
    raw_id = row["category_id"]
    if pd.isna(raw_id):
        # No id to look up; int(nan) would raise otherwise.
        return None

    # The JSON stores ids as strings ("24"), while the CSV column may be parsed
    # as int or float (24.0), so normalise through int before comparing.
    category_id = str(int(raw_id))
    return _load_category_titles(code).get(category_id)

def merge_video_infos(country_codes, folder):
    """Merge the per-country video CSVs into a single DataFrame.

    For every code in ``country_codes`` the file ``folder + code + '.csv'`` is
    read, rows without a ``category_id`` are discarded, and two columns are
    added: ``category`` (resolved by ``define_category``) and ``country`` (the
    code itself). A progress line ``"<code> merged"`` is printed per country.

    Parameters
    ----------
    country_codes : iterable of str
        Country codes to merge, in the order they should appear in the result.
    folder : str
        Path prefix placed directly in front of ``<code>.csv`` (no separator
        is inserted, so it may end with a file-name prefix).

    Returns
    -------
    pandas.DataFrame
        All countries stacked in input order, with a fresh 0..n-1 index and
        the ``description`` column removed.

    Raises
    ------
    ValueError
        If ``country_codes`` is empty (``pd.concat`` has nothing to concatenate).
    KeyError
        If the merged data has no ``description`` column.
    """
    video_list = []
    for code in country_codes:
        # read CSV corresponding to the country code
        raw_videos = pd.read_csv(folder+code+'.csv')

        # Keep only rows that carry a category id. The explicit copy makes the
        # column assignments below act on an independent frame rather than on
        # a slice of raw_videos (avoids SettingWithCopyWarning).
        video_info = raw_videos.loc[raw_videos['category_id'].notna()].copy()
        video_info["category"] = video_info.apply(define_category, axis=1, args=(code,))

        # assigns the country code
        video_info["country"] = code

        video_list.append(video_info)
        print(code+" merged")

    # ignore_index renumbers the rows directly, so no temporary "index" column
    # has to be created and dropped again.
    video_info_merged = pd.concat(video_list, ignore_index=True)
    return video_info_merged.drop(columns=["description"])


In [3]:
# Kaggle dataset: one CSV per country, located at <folder><code>.csv
folder = "Kaggle_DataSet/Kaggle_"
country_codes = ["DE", "GB", "FR", "US"]

video_info_merged = merge_video_infos(country_codes=country_codes, folder=folder)
video_info_merged.head()

DE merged
GB merged
FR merged
US merged


Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,category,country
0,LgVi6y5QIjM,17.14.11,Sing zu Ende! | Gesangseinlagen vom Feinsten |...,inscope21,24,2017-11-13T17:08:49.000Z,"inscope21|""sing zu ende""|""gesangseinlagen""|""ge...",252786,35885,230,1539,https://i.ytimg.com/vi/LgVi6y5QIjM/default.jpg,False,False,False,Entertainment,DE
1,Bayt7uQith4,17.14.11,Kinder ferngesteuert im Kiosk! Erwachsene abzo...,LUKE! Die Woche und ich,23,2017-11-12T22:30:01.000Z,"Kinder|""ferngesteuert""|""Kinder ferngesteuert""|...",797196,53576,302,1278,https://i.ytimg.com/vi/Bayt7uQith4/default.jpg,False,False,False,Comedy,DE
2,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97190,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,Entertainment,DE
3,AHtypnRk7JE,17.14.11,Das Fermi-Paradoxon,100SekundenPhysik,27,2017-11-12T15:00:01.000Z,"Physik|""Wissenschaft""|""Technik""|""Science-Ficti...",380247,31821,458,1955,https://i.ytimg.com/vi/AHtypnRk7JE/default.jpg,False,False,False,Education,DE
4,ZJ9We4bjcg0,17.14.11,18 SONGS mit Kelly MissesVlog (Sing-off),rezo,24,2017-11-12T13:10:36.000Z,"kelly|""missesvlog""|""kelly song""|""bausa""|""bausa...",822213,100684,2467,10244,https://i.ytimg.com/vi/ZJ9We4bjcg0/default.jpg,False,False,False,Entertainment,DE


In [4]:
# Persist the merged frame; index=False keeps the row index out of the file.
video_info_merged.to_csv("Videos_merged.csv", index=False)

In [5]:
# Recent dataset: same merge, applied to a different set of countries.
# Note: this rebinds video_info_merged, replacing the frame previously held
# under that name.
folder = "Recent_DataSet/Recent_"
country_codes = ["PT", "ES", "KE", "NP", "BR"]

video_info_merged = merge_video_infos(country_codes=country_codes, folder=folder)

PT merged
ES merged
KE merged
NP merged
BR merged


In [6]:
# Persist the current video_info_merged frame; index=False keeps the row
# index out of the file.
video_info_merged.to_csv("Recent_videos_merged.csv", index=False)