In [1]:
import math
import collections
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Notebook-wide Matplotlib defaults, applied once so every later figure
# shares the same fonts, canvas size, resolution and line weight.
plt.rcParams["font.size"] = 18
plt.rcParams["font.family"] = "sans-serif"
plt.rcParams["figure.figsize"] = (20, 8)
plt.rcParams["axes.facecolor"] = "ffffff"
plt.rcParams["figure.dpi"] = 200
plt.rcParams["legend.fontsize"] = "large"
plt.rcParams["figure.titlesize"] = "medium"
plt.rcParams["lines.linewidth"] = 3

In [2]:
# Input locations, relative to the notebook's working directory.
# One video-level CSV, plus one transcript-analysis CSV per language.
video_analysis_file = "data/[MUSE India] [RP Outputs] - Muse_India_Study_yt_local.csv.csv"

# Display names of the five study languages; the per-language file list
# below is generated in this same order.
languages = ["Bengali", "Hindi", "Kannada", "Tamil", "Telugu"]

# The per-language files differ only in the trailing two-letter code.
language_analyis_files = [
    f"data/[MUSE India] [Final] Language Analysis Results - {code}.csv"
    for code in ("bn", "hi", "kn", "ta", "te")
]

In [3]:
# Load the video-level table and one transcript-analysis table per language.
video_analysis_df = pd.read_csv(video_analysis_file, index_col=None)
language_analyis_dfs = [pd.read_csv(csv_path, index_col=None)
                        for csv_path in language_analyis_files]

# ignore_index=True: every CSV is read with its own 0-based default index, so a
# plain concat would repeat the same labels once per language and make any
# label-based lookup (e.g. .loc[0]) ambiguous. A fresh 0..n-1 index keeps each
# row uniquely addressable.
language_analyis_df = pd.concat(language_analyis_dfs, ignore_index=True)

In [4]:
# Preview the first five rows of the video-level table.
video_analysis_df.head(n=5)

Unnamed: 0,Year,Rank,Program name,Channel,Program Theme,Program Genre,Programme Language,# of episodes,rat%/AP,Daily Avg Rch%,...,"[60, inf)",masculine_faces,feminine_faces,age_1_screen_time,age_2_screen_time,age_3_screen_time,age_4_screen_time,"(female, [60, inf), [8.1, 9.1))","(male, [0, 18), [9.1, 10.1))","(female, [0, 18), [9.1, 10.1))"
0,2018,13,ADORINI,STAR Jalsha,DRAMA/SOAP,SERIALS,BENGALI,220,2.48,5.0,...,2,4641,12023,0.009482,0.881901,0.108497,0.00012,,,
1,2018,13,ADORINI,STAR Jalsha,DRAMA/SOAP,SERIALS,BENGALI,220,2.48,5.0,...,0,7448,4741,0.006727,0.940192,0.053081,0.0,,,
2,2018,13,ADORINI,STAR Jalsha,DRAMA/SOAP,SERIALS,BENGALI,220,2.48,5.0,...,0,1383,3816,0.013656,0.969417,0.016926,0.0,,,
3,2018,13,ADORINI,STAR Jalsha,DRAMA/SOAP,SERIALS,BENGALI,220,2.48,5.0,...,1412,4003,10725,0.015888,0.776888,0.111353,0.095872,,,
4,2018,13,ADORINI,STAR Jalsha,DRAMA/SOAP,SERIALS,BENGALI,220,2.48,5.0,...,15,6790,16032,0.009684,0.852073,0.137587,0.000657,,,


In [5]:
# Preview the first five rows of the combined language-analysis table.
language_analyis_df.head(n=5)

Unnamed: 0,Language,Video ID,Controversial Topics (LLM),Derogatory Words (LLM),Derogatory Words (dictionary),Person names,Transcript word count,Transcript unique word count,Transcript non-stopword count,Transcript unique non-stop word count
0,bn,-PUpZlgFVlI,,,,"[('আমার', 'hindu', 'male'), ('ভাই', 'hindu', '...",36,23,9,7
1,bn,-R6Iou8twaw,,,,"[('నీ అభిలాషలో', 'hindu', 'male'), ('నేను', 'h...",16,15,5,5
2,bn,-Y9oaa82Vxw,,,,"[('বাবা', 'hindu', 'male'), ('জমিদার মশাই', 'h...",1626,596,855,475
3,bn,0B1QD63DzOc,,,দাস,,1840,687,1077,565
4,bn,0qUmRtgLuPY,,,সাদা,,1649,656,888,512


In [6]:
# Size of each table (rows, columns), one per line.
print(video_analysis_df.shape, language_analyis_df.shape, sep="\n")

# How many rows each table holds per language. The two tables label languages
# differently (upper-case names vs. two-letter codes), so the counts are
# reported separately rather than compared directly.
print("Language distribution in video-analysis-dataframe = "
      + str(collections.Counter(video_analysis_df["Programme Language"])))
print("Language distribution in language-analysis-dataframe = "
      + str(collections.Counter(language_analyis_df["Language"])))

(1199, 119)
(1186, 10)
Language distribution in video-analysis-dataframe = Counter({'HINDI': 268, 'TAMIL': 240, 'KANNADA': 232, 'BENGALI': 230, 'TELUGU': 229})
Language distribution in language-analysis-dataframe = Counter({'te': 247, 'hi': 240, 'ta': 236, 'bn': 232, 'kn': 231})


In [10]:
# Show the join key next to the programme language, then report how many
# rows of the video-level table have no key at all.
display(video_analysis_df.loc[:, ["video_key", "Programme Language"]])
print(f"Number of null video keys = {video_analysis_df['video_key'].isnull().sum()}")

Unnamed: 0,video_key,Programme Language
0,,BENGALI
1,,BENGALI
2,,BENGALI
3,,BENGALI
4,,BENGALI
...,...,...
1194,f5GtMzMmDUg,TELUGU
1195,tDKlXzZetw4,TELUGU
1196,uf6I4Ff32Rw,TELUGU
1197,-0CxY8ee6tE,TELUGU


Number of null video keys = 185


In [11]:
# Show the language-analysis identifiers, then count those that are file
# names ending in ".mp4" rather than bare IDs.
display(language_analyis_df.loc[:, ["Video ID", "Language"]])
print(f"Number of video ids ending in mp4 = {language_analyis_df['Video ID'].str.endswith('.mp4').sum()}")

Unnamed: 0,Video ID,Language
0,-PUpZlgFVlI,bn
1,-R6Iou8twaw,bn
2,-Y9oaa82Vxw,bn
3,0B1QD63DzOc,bn
4,0qUmRtgLuPY,bn
...,...,...
242,Tel_51.mp4,te
243,Tel_52.mp4,te
244,Tel_53.mp4,te
245,Tel_54.mp4,te


Number of video ids ending in mp4 = 200


In [9]:
# Pair each video's programme language (video-level table) with the language
# code recorded for the same video in the language-analysis table.
vd_key_lang_df = video_analysis_df[["video_key", "Programme Language"]]
ln_key_lang_df = language_analyis_df[["Video ID", "Language"]]

# Rows with a null video_key are dropped for the join only: pandas merge
# matches null keys against each other, so keyless rows could otherwise be
# paired with any row whose "Video ID" is also null. vd_key_lang_df itself is
# left unfiltered.
key_lang_df = (
    vd_key_lang_df
    .dropna(subset=["video_key"])
    .merge(ln_key_lang_df, how="inner", left_on="video_key", right_on="Video ID")
)
key_lang_df

Unnamed: 0,video_key,Programme Language,Video ID,Language
0,8V-mhzGr8rA,BENGALI,8V-mhzGr8rA,bn
1,sR0g8hDld8I,BENGALI,sR0g8hDld8I,bn
2,oXdB7Otc6Lo,BENGALI,oXdB7Otc6Lo,bn
3,RMmSjFxOPKY,BENGALI,RMmSjFxOPKY,bn
4,zZ46moWsngI,BENGALI,zZ46moWsngI,bn
...,...,...,...,...
1001,f5GtMzMmDUg,TELUGU,f5GtMzMmDUg,te
1002,tDKlXzZetw4,TELUGU,tDKlXzZetw4,te
1003,uf6I4Ff32Rw,TELUGU,uf6I4Ff32Rw,te
1004,-0CxY8ee6tE,TELUGU,-0CxY8ee6tE,te


In [18]:
# Upper-case programme-language name -> two-letter code used by the
# language-analysis table.
language_abbreviation = {
    "BENGALI": "bn",
    "HINDI": "hi",
    "KANNADA": "kn",
    "TAMIL": "ta",
    "TELUGU": "te",
}

# Number of merged rows where both tables agree on the language.
# A programme language missing from the mapping raises KeyError.
print(
    key_lang_df["Programme Language"]
    .apply(lambda name: language_abbreviation[name])
    .eq(key_lang_df["Language"])
    .sum()
)

1001


In [21]:
# Video keys of merged rows where the programme language (converted to its
# two-letter code) differs from the language-analysis code.
non_matching_language_video_keys = key_lang_df.loc[
    key_lang_df["Programme Language"]
    .apply(lambda name: language_abbreviation[name])
    .ne(key_lang_df["Language"]),
    "video_key",
].tolist()
print(non_matching_language_video_keys)

['POwLwJJ0bqk', 'SNKVlxp6OJQ', '_bETsUyh8Ak', '9bt8hCr_ssU', 'P9u3DurSTcA']


In [22]:
# Look up the link and annotator notes for the videos whose language differs
# between the two tables.
video_analysis_df[
    video_analysis_df["video_key"].isin(non_matching_language_video_keys)
][["YouTu.be link", "video_key", "Notes"]]

Unnamed: 0,YouTu.be link,video_key,Notes
855,https://youtu.be/POwLwJJ0bqk,POwLwJJ0bqk,dubbed in bengali
856,https://youtu.be/SNKVlxp6OJQ,SNKVlxp6OJQ,dubbed in bengali
857,https://youtu.be/_bETsUyh8Ak,_bETsUyh8Ak,dubbed in bengali
858,https://youtu.be/9bt8hCr_ssU,9bt8hCr_ssU,dubbed in bengali
859,https://youtu.be/P9u3DurSTcA,P9u3DurSTcA,dubbed in bengali
