In [1]:
import math
import collections
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# update matplotlib's global rcParams: figure geometry, fonts and line width
plt.rcParams.update(
    {
        # figure canvas
        "figure.figsize": (20, 8),
        "figure.dpi": 200,
        "figure.titlesize": "medium",
        "axes.facecolor": "ffffff",
        # text
        "font.family": "sans-serif",
        "font.size": 18,
        "legend.fontsize": "large",
        # lines
        "lines.linewidth": 3,
    }
)

In [2]:
# input files: the video analysis results and one language analysis file per language
video_analysis_file = "data/[MUSE India] [RP Outputs] - Muse_India_Study_yt_local.csv.csv"
languages = [
    "Bengali",
    "Hindi",
    "Kannada",
    "Tamil",
    "Telugu",
]
# the file suffixes are listed in the same order as the names in `languages`
language_analyis_files = [
    f"data/[MUSE India] [Final] Language Analysis Results - {lang_code}.csv"
    for lang_code in ("bn", "hi", "kn", "ta", "te")
]

In [3]:
# load the video analysis table, then every per-language table, and stack the
# per-language tables into a single dataframe
video_analysis_df = pd.read_csv(video_analysis_file, index_col=None)
language_analyis_dfs = [
    pd.read_csv(csv_path, index_col=None)
    for csv_path in language_analyis_files
]
language_analyis_df = pd.concat(language_analyis_dfs)

In [4]:
# list the columns of the video analysis data, one name per line
print(*video_analysis_df.columns, sep="\n")

Year
Rank
Program name
Channel
Program Theme
Program Genre
Programme Language
# of episodes
rat%/AP
Daily Avg Rch%
Daily Avg Rch'000
Ats(viewer)
Program duration
Cat No.
YouTu.be link
video_key
Notes
frames_analyzed
(female, [0, 18), [-inf, 1.1))
(female, [0, 18), [1.1, 2.1))
(female, [0, 18), [2.1, 3.1))
(female, [0, 18), [3.1, 4.1))
(female, [0, 18), [4.1, 5.1))
(female, [0, 18), [5.1, 6.1))
(female, [0, 18), [6.1, 7.1))
(female, [0, 18), [7.1, 8.1))
(female, [0, 18), [8.1, 9.1))
(female, [18, 33), [-inf, 1.1))
(female, [18, 33), [1.1, 2.1))
(female, [18, 33), [2.1, 3.1))
(female, [18, 33), [3.1, 4.1))
(female, [18, 33), [4.1, 5.1))
(female, [18, 33), [5.1, 6.1))
(female, [18, 33), [6.1, 7.1))
(female, [18, 33), [7.1, 8.1))
(female, [18, 33), [8.1, 9.1))
(female, [18, 33), [9.1, 10.1))
(female, [33, 60), [-inf, 1.1))
(female, [33, 60), [1.1, 2.1))
(female, [33, 60), [2.1, 3.1))
(female, [33, 60), [3.1, 4.1))
(female, [33, 60), [4.1, 5.1))
(female, [33, 60), [5.1, 6.1))
(female, [33, 

In [5]:
# number of videos in each year
collections.Counter(video_analysis_df["Year"].tolist())

Counter({2018: 225, 2019: 231, 2020: 250, 2021: 228, 2022: 265})

In [6]:
# how many distinct program names appear in the data
len(video_analysis_df["Program name"].unique())

152

In [7]:
# for every program, the distinct years in which it appears
video_analysis_df.groupby("Program name")["Year"].unique()

Program name
AALTA PHORING                         [2022]
ABHIYUM NANUM             [2020, 2021, 2022]
ADORINI                               [2018]
AGNISAKSHI                            [2018]
ALO CHHAYA                            [2019]
                                 ...        
VISHNU PURAN                          [2020]
YAARE NEE MOHINI          [2018, 2019, 2020]
YAMALEELA                             [2020]
YEH HAI CHAHATEIN                     [2022]
ZINDAGI MERE GHAR AANA                [2022]
Name: Year, Length: 152, dtype: object

In [8]:
# how many distinct channels appear in the data
len(video_analysis_df["Channel"].unique())

25

In [9]:
# the distinct channel names in sorted order
sorted(pd.unique(video_analysis_df["Channel"]))

['Big Magic',
 'Colors',
 'Colors Kannada',
 'Colors Rishtey',
 'DD Bharati',
 'DD National',
 'Dangal',
 'Dangal/DD National',
 'ETV Telugu',
 'STAR Jalsha',
 'STAR Maa',
 'STAR Plus',
 'STAR Utsav',
 'STAR Vijay',
 'Shemaroo TV',
 'Sony Pal',
 'Sun TV',
 'Udaya TV',
 'Zee Anmol',
 'Zee Bangla',
 'Zee Kannada',
 'Zee TV',
 'Zee TV/Zee Anmol',
 'Zee Tamil',
 'Zee Telugu']

In [9]:
# programs that are listed under more than one channel
df = video_analysis_df.groupby("Program name")["Channel"].unique()
df[df.map(len) > 1]

Program name
RAMAYAN      [Dangal, Dangal/DD National]
TRINAYANI        [Zee Bangla, Zee Telugu]
Name: Channel, dtype: object

In [14]:
# number of videos per program theme
collections.Counter(video_analysis_df["Program Theme"].tolist())

Counter({'DRAMA/SOAP': 1115,
         'ACTION/THRILLER': 10,
         'MYTHOLOGICAL/COSTUME DRAMAS': 69,
         'CHILDRENS PROGRAM': 5})

In [15]:
# distinct values of "number of themes per program"; a result of [1] means
# every program name maps to exactly one program theme
(
    video_analysis_df
    .groupby("Program name")["Program Theme"]
    .unique()
    .map(len)
    .unique()
)

array([1])

In [16]:
# programs that are listed under more than one language
df = video_analysis_df.groupby("Program name")["Programme Language"].unique()
df[df.map(len) > 1]

Program name
TRINAYANI    [BENGALI, TELUGU]
Name: Programme Language, dtype: object

In [17]:
# videos per (year, language), counted as non-null "Program name" entries
video_analysis_df.groupby(["Year", "Programme Language"])["Program name"].count()

Year  Programme Language
2018  BENGALI               35
      HINDI                 47
      KANNADA               49
      TAMIL                 45
      TELUGU                49
2019  BENGALI               36
      HINDI                 50
      KANNADA               45
      TAMIL                 50
      TELUGU                50
2020  BENGALI               41
      HINDI                 65
      KANNADA               50
      TAMIL                 45
      TELUGU                49
2021  BENGALI               49
      HINDI                 50
      KANNADA               49
      TAMIL                 50
      TELUGU                30
2022  BENGALI               69
      HINDI                 56
      KANNADA               39
      TAMIL                 50
      TELUGU                51
Name: Program name, dtype: int64

In [10]:
# create faces data
# n_faces_arr[i, j, k, l] = number of faces of gender j and age k and skintone l in video i
# (i is the row position of the video in video_analysis_df)
gender_cats = ["male", "female"]
age_cats = ["[0, 18)", "[18, 33)", "[33, 60)", "[60, inf)"]
skintone_cats = ["[-inf, 1.1)", "[1.1, 2.1)", "[2.1, 3.1)", "[3.1, 4.1)", "[4.1, 5.1)", "[5.1, 6.1)", "[6.1, 7.1)",
                 "[7.1, 8.1)", "[8.1, 9.1)", "[9.1, 10.1)"]
n_faces_arr = np.zeros((len(video_analysis_df), len(gender_cats), len(age_cats), len(skintone_cats)), dtype=int)

# Fill one (gender, age, skintone) slice at a time from the matching column.
# Assigning the whole column is positional, so it does not rely on the
# dataframe index labels being 0..n-1 (the row-wise version indexed the array
# with the iterrows() label), and it avoids a Python-level lookup per cell.
for i, gender_cat in enumerate(gender_cats):
    for j, age_cat in enumerate(age_cats):
        for k, skintone_cat in enumerate(skintone_cats):
            cat = f"({gender_cat}, {age_cat}, {skintone_cat})"
            # combinations without a column stay at zero
            if cat in video_analysis_df.columns:
                # missing values count as zero faces
                n_faces_arr[:, i, j, k] = video_analysis_df[cat].fillna(0).to_numpy()

In [19]:
# verify, video by video, that the counts in n_faces_arr agree with the
# aggregate columns of the video analysis data: the overall total, then the
# per-gender, per-age and per-skintone totals
gender_total_cols = ["masculine_faces", "feminine_faces"]
skintone_total_cols = [f"mst_scale_{i}" for i in range(1, 11)]

for index, row in video_analysis_df.iterrows():
    video_faces = n_faces_arr[index]  # axes: (gender, age, skintone)
    assert video_faces.sum() == row["faces"]
    assert all(video_faces.sum(axis=(1, 2)) == row[gender_total_cols].tolist())
    assert all(video_faces.sum(axis=(0, 2)) == row[age_cats].tolist())
    assert all(video_faces.sum(axis=(0, 1)) == row[skintone_total_cols])

In [12]:
# build the per-video metadata table: video key, program name, year, language
# and program theme (stored as "genre"); video_key is the row position of the
# video in video_analysis_df
metadata_df = (
    video_analysis_df[["Year", "Program name", "Programme Language", "Program Theme"]]
    .rename(columns={
        "Year": "year",
        "Program name": "program",
        "Programme Language": "lang",
        "Program Theme": "genre",
    })
    .assign(video_key=np.arange(len(video_analysis_df)))
    .loc[:, ["video_key", "program", "year", "lang", "genre"]]
)

In [13]:
# create metadata and video analysis dataframe
# columns are video_key, program name, year, lang, genre, gender, age, skintone, number of faces
# each video_key will have 2 x 4 x 3 rows = 24 rows
data_rows = []
named_age_cats = ["young", "adult", "middle_aged", "old"]
named_skintone_cats = ["light", "medium", "dark"]
# half-open [start, stop) index ranges along the skintone axis of n_faces_arr
# that are summed into each named skintone group
skintone_bins = {"light": (0, 3), "medium": (3, 6), "dark": (6, 10)}

# enumerate gives the row position, which is what n_faces_arr is indexed by;
# the iterrows() label is ignored so a non-default index cannot misalign rows
for position, (_, row) in enumerate(metadata_df.iterrows()):
    metadata_values = row.tolist()

    for i, gender_cat in enumerate(gender_cats):
        # named_age_cats is indexed in parallel with the age axis of n_faces_arr
        for j, age_cat in enumerate(named_age_cats):
            for skintone_cat in named_skintone_cats:
                k1, k2 = skintone_bins[skintone_cat]
                faces = n_faces_arr[position, i, j, k1 : k2].sum()
                data_rows.append(metadata_values + [gender_cat, age_cat, skintone_cat, faces])


data_df = pd.DataFrame(data_rows, columns=metadata_df.columns.tolist() + ["gender", "age", "skintone", "faces"])
data_df.to_csv("data_long_form.csv", index=False)

In [14]:
# display the long-form dataframe (one row per video_key, gender, age and skintone combination)
data_df

Unnamed: 0,video_key,program,year,lang,genre,gender,age,skintone,faces
0,0,ADORINI,2018,BENGALI,DRAMA/SOAP,male,young,light,3
1,0,ADORINI,2018,BENGALI,DRAMA/SOAP,male,young,medium,0
2,0,ADORINI,2018,BENGALI,DRAMA/SOAP,male,young,dark,1
3,0,ADORINI,2018,BENGALI,DRAMA/SOAP,male,adult,light,2211
4,0,ADORINI,2018,BENGALI,DRAMA/SOAP,male,adult,medium,1390
...,...,...,...,...,...,...,...,...,...
28771,1198,NUMBER 1 KODALU,2022,TELUGU,DRAMA/SOAP,female,middle_aged,medium,0
28772,1198,NUMBER 1 KODALU,2022,TELUGU,DRAMA/SOAP,female,middle_aged,dark,0
28773,1198,NUMBER 1 KODALU,2022,TELUGU,DRAMA/SOAP,female,old,light,0
28774,1198,NUMBER 1 KODALU,2022,TELUGU,DRAMA/SOAP,female,old,medium,0
