In [1]:
import math
import collections
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

plt.rcParams.update({"font.size": 18,
                     "font.family": "sans-serif",
                     "figure.figsize": (20, 8),
                     "axes.facecolor": "ffffff",
                     "figure.dpi"       : 200,
                     "legend.fontsize"  : "large",
                     "figure.titlesize" : "medium",
                     "lines.linewidth": 3,
})

In [2]:
# set filenames
video_analysis_file = "data/[MUSE India] [RP Outputs] - Muse_India_Study_yt_local.csv.csv"
languages = ["Bengali", "Hindi", "Kannada", "Tamil", "Telugu"]
language_analyis_files = [
    "data/[MUSE India] [Final] Language Analysis Results - bn.csv",
    "data/[MUSE India] [Final] Language Analysis Results - hi.csv",
    "data/[MUSE India] [Final] Language Analysis Results - kn.csv",
    "data/[MUSE India] [Final] Language Analysis Results - ta.csv",
    "data/[MUSE India] [Final] Language Analysis Results - te.csv"
    ]

In [3]:
# read the video analysis and language analysis dataframes
video_analysis_df = pd.read_csv(video_analysis_file, index_col=None)
language_analyis_dfs = [pd.read_csv(language_analyis_file, index_col=None) 
                        for language_analyis_file in language_analyis_files]
language_analyis_df = pd.concat(language_analyis_dfs)

In [4]:
# print column names of the video analysis data
for col in video_analysis_df.columns:
    print(col)

Year
Rank
Program name
Channel
Program Theme
Program Genre
Programme Language
# of episodes
rat%/AP
Daily Avg Rch%
Daily Avg Rch'000
Ats(viewer)
Program duration
Cat No.
YouTu.be link
video_key
Notes
frames_analyzed
(female, [0, 18), [-inf, 1.1))
(female, [0, 18), [1.1, 2.1))
(female, [0, 18), [2.1, 3.1))
(female, [0, 18), [3.1, 4.1))
(female, [0, 18), [4.1, 5.1))
(female, [0, 18), [5.1, 6.1))
(female, [0, 18), [6.1, 7.1))
(female, [0, 18), [7.1, 8.1))
(female, [0, 18), [8.1, 9.1))
(female, [18, 33), [-inf, 1.1))
(female, [18, 33), [1.1, 2.1))
(female, [18, 33), [2.1, 3.1))
(female, [18, 33), [3.1, 4.1))
(female, [18, 33), [4.1, 5.1))
(female, [18, 33), [5.1, 6.1))
(female, [18, 33), [6.1, 7.1))
(female, [18, 33), [7.1, 8.1))
(female, [18, 33), [8.1, 9.1))
(female, [18, 33), [9.1, 10.1))
(female, [33, 60), [-inf, 1.1))
(female, [33, 60), [1.1, 2.1))
(female, [33, 60), [2.1, 3.1))
(female, [33, 60), [3.1, 4.1))
(female, [33, 60), [4.1, 5.1))
(female, [33, 60), [5.1, 6.1))
(female, [33, 

In [5]:
collections.Counter(video_analysis_df["Year"])

Counter({2018: 225, 2019: 231, 2020: 250, 2021: 228, 2022: 265})

In [9]:
video_analysis_df["Program name"].unique().size

152

In [25]:
video_analysis_df[["Program name", "Year"]].groupby("Program name")["Year"].unique()

Program name
AALTA PHORING                         [2022]
ABHIYUM NANUM             [2020, 2021, 2022]
ADORINI                               [2018]
AGNISAKSHI                            [2018]
ALO CHHAYA                            [2019]
                                 ...        
VISHNU PURAN                          [2020]
YAARE NEE MOHINI          [2018, 2019, 2020]
YAMALEELA                             [2020]
YEH HAI CHAHATEIN                     [2022]
ZINDAGI MERE GHAR AANA                [2022]
Name: Year, Length: 152, dtype: object

In [10]:
video_analysis_df["Channel"].unique().size

25

In [41]:
df = video_analysis_df[["Program name", "Channel"]].groupby("Program name")["Channel"].unique()
df[df.apply(len) > 1]

Program name
RAMAYAN      [Dangal, Dangal/DD National]
TRINAYANI        [Zee Bangla, Zee Telugu]
Name: Channel, dtype: object

In [11]:
video_analysis_df["Program Theme"].unique().size

4

In [12]:
collections.Counter(video_analysis_df["Program Theme"])

Counter({'DRAMA/SOAP': 1115,
         'ACTION/THRILLER': 10,
         'MYTHOLOGICAL/COSTUME DRAMAS': 69,
         'CHILDRENS PROGRAM': 5})

In [35]:
video_analysis_df[["Program name", "Program Theme"]].groupby("Program name")["Program Theme"].unique().apply(len).unique()

array([1])

In [40]:
df = video_analysis_df[["Program name", "Programme Language"]].groupby("Program name")["Programme Language"].unique()
df[df.apply(len) > 1]

Program name
TRINAYANI    [BENGALI, TELUGU]
Name: Programme Language, dtype: object

In [44]:
video_analysis_df.groupby(["Year", "Programme Language"]).count()["Program name"]

Year  Programme Language
2018  BENGALI               35
      HINDI                 47
      KANNADA               49
      TAMIL                 45
      TELUGU                49
2019  BENGALI               36
      HINDI                 50
      KANNADA               45
      TAMIL                 50
      TELUGU                50
2020  BENGALI               41
      HINDI                 65
      KANNADA               50
      TAMIL                 45
      TELUGU                49
2021  BENGALI               49
      HINDI                 50
      KANNADA               49
      TAMIL                 50
      TELUGU                30
2022  BENGALI               69
      HINDI                 56
      KANNADA               39
      TAMIL                 50
      TELUGU                51
Name: Program name, dtype: int64

In [53]:
gender_cats = ["male", "female"]
age_cats = ["[0, 18)", "[18, 33)", "[33, 60)", "[60, inf)"]
skintone_cats = ["[-inf, 1.1)", "[1.1, 2.1)", "[2.1, 3.1)", "[3.1, 4.1)", "[4.1, 5.1)", "[5.1, 6.1)", "[6.1, 7.1)",
                 "[7.1, 8.1)", "[8.1, 9.1)", "[9.1, 10.1)"]
n_faces_arr = np.zeros((len(video_analysis_df), len(gender_cats), len(age_cats), len(skintone_cats)), dtype=int)

for index, row in video_analysis_df.iterrows():
    for i, gender_cat in enumerate(gender_cats):
        for j, age_cat in enumerate(age_cats):
            for k, skintone_cat in enumerate(skintone_cats):
                cat = f"({gender_cat}, {age_cat}, {skintone_cat})"
                if cat in row and pd.notna(row[cat]):
                    n_faces_arr[index, i, j, k] = row[cat]

In [56]:
n_faces_arr[0].sum(axis=(1,2))

array([ 4641, 12023])

In [63]:
row[["masculine_faces", "feminine_faces"]].tolist()

[7130, 13146]

In [67]:
for index, row in video_analysis_df.iterrows():
    assert n_faces_arr[index].sum() == row["faces"]
    assert all(n_faces_arr[index].sum(axis=(1, 2)) == row[["masculine_faces", "feminine_faces"]].tolist())
    assert all(n_faces_arr[index].sum(axis=(0, 2)) == row[age_cats].tolist())
    assert all(n_faces_arr[index].sum(axis=(0, 1)) == row[[f"mst_scale_{i}" for i in range(1, 11)]])

In [77]:
metadata_df = video_analysis_df[["Year", "Program name", "Programme Language"]].copy()
metadata_df.columns = ["year", "name", "lang"]
metadata_df["video-id"] = np.arange(len(video_analysis_df))
metadata_df = metadata_df[["video-id", "name", "year", "lang"]]

In [78]:
metadata_df.iloc[0].tolist()

[0, 'ADORINI', 2018, 'BENGALI']

In [79]:
gender_rows, age_rows, skintone_rows = [], [], []

for index, row in metadata_df.iterrows():
    total_faces = n_faces_arr[index].sum()

    male_screentime = n_faces_arr[index, 0].sum()/total_faces
    female_screentime = n_faces_arr[index, 1].sum()/total_faces

    young_screentime = n_faces_arr[index, :, 0].sum()/total_faces
    adult_screentime = n_faces_arr[index, :, 1].sum()/total_faces
    middle_screentime = n_faces_arr[index, :, 2].sum()/total_faces
    old_screentime = n_faces_arr[index, :, 3].sum()/total_faces

    light_screentime = n_faces_arr[index, :, :, :3].sum()/total_faces
    medium_screentime = n_faces_arr[index, :, :, 3:6].sum()/total_faces
    dark_screentime = n_faces_arr[index, :, :, 6:].sum()/total_faces

    gender_rows.append(row.tolist() + ["male", male_screentime])
    gender_rows.append(row.tolist() + ["female", female_screentime])
    age_rows.append(row.tolist() + ["young", young_screentime])
    age_rows.append(row.tolist() + ["adult", adult_screentime])
    age_rows.append(row.tolist() + ["middle-aged", middle_screentime])
    age_rows.append(row.tolist() + ["old", old_screentime])
    skintone_rows.append(row.tolist() + ["light", light_screentime])
    skintone_rows.append(row.tolist() + ["medium", medium_screentime])
    skintone_rows.append(row.tolist() + ["dark", dark_screentime])

gender_df = pd.DataFrame(gender_rows, columns=metadata_df.columns.tolist() + ["gender", "screentime"])
age_df = pd.DataFrame(age_rows, columns=metadata_df.columns.tolist() + ["age", "screentime"])
skintone_df = pd.DataFrame(skintone_rows, columns=metadata_df.columns.tolist() + ["skintone", "screentime"])



In [80]:
gender_df

Unnamed: 0,video-id,name,year,lang,gender,screentime
0,0,ADORINI,2018,BENGALI,male,0.278505
1,0,ADORINI,2018,BENGALI,female,0.721495
2,1,ADORINI,2018,BENGALI,male,0.611043
3,1,ADORINI,2018,BENGALI,female,0.388957
4,2,ADORINI,2018,BENGALI,male,0.266013
...,...,...,...,...,...,...
2393,1196,NUMBER 1 KODALU,2022,TELUGU,female,0.430955
2394,1197,NUMBER 1 KODALU,2022,TELUGU,male,0.731627
2395,1197,NUMBER 1 KODALU,2022,TELUGU,female,0.268373
2396,1198,NUMBER 1 KODALU,2022,TELUGU,male,0.351647


In [81]:
age_df

Unnamed: 0,video-id,name,year,lang,age,screentime
0,0,ADORINI,2018,BENGALI,young,0.009482
1,0,ADORINI,2018,BENGALI,adult,0.881901
2,0,ADORINI,2018,BENGALI,middle-aged,0.108497
3,0,ADORINI,2018,BENGALI,old,0.000120
4,1,ADORINI,2018,BENGALI,young,0.006727
...,...,...,...,...,...,...
4791,1197,NUMBER 1 KODALU,2022,TELUGU,old,0.000000
4792,1198,NUMBER 1 KODALU,2022,TELUGU,young,0.032797
4793,1198,NUMBER 1 KODALU,2022,TELUGU,adult,0.847850
4794,1198,NUMBER 1 KODALU,2022,TELUGU,middle-aged,0.119353


In [82]:
skintone_df

Unnamed: 0,video-id,name,year,lang,skintone,screentime
0,0,ADORINI,2018,BENGALI,light,0.623800
1,0,ADORINI,2018,BENGALI,medium,0.362638
2,0,ADORINI,2018,BENGALI,dark,0.013562
3,1,ADORINI,2018,BENGALI,light,0.907376
4,1,ADORINI,2018,BENGALI,medium,0.091804
...,...,...,...,...,...,...
3592,1197,NUMBER 1 KODALU,2022,TELUGU,medium,0.197996
3593,1197,NUMBER 1 KODALU,2022,TELUGU,dark,0.032999
3594,1198,NUMBER 1 KODALU,2022,TELUGU,light,0.853719
3595,1198,NUMBER 1 KODALU,2022,TELUGU,medium,0.143815


In [83]:
gender_df.to_csv("gender.csv", index=False)
age_df.to_csv("age.csv", index=False)
skintone_df.to_csv("skintone.csv", index=False)