### Exploratory Data Analysis
Last modified: 5/6/24 \
Author: Caroline Jung & Josie Ramirez

In [5]:
import pandas as pd
import os
import json

In [4]:
dir_path = "/users/carolinejung/CS315-proj3-group2/1-data_collection/" #CHANGE ME!

First, read in all videos, their descriptions, and comments for both female and male politicians

In [6]:
female = pd.read_csv(dir_path+"compiled_data/female.csv")
male = pd.read_csv(dir_path+"compiled_data/male.csv")

In [21]:
def count(file):
    """For a given dataframe (either female or male accounts), count the number of words in all descriptions and comments."""
    num_words_desc, num_words_comments = 0, 0
    for description in file["description_list"]:
        num_words_desc += len(description)
    for comments in file["comments"]:
        for comment in comments:
            num_words_comments += len(comment)
    return num_words_desc, num_words_comments

def summary(gender, file):
    """Output a summary of the number of accounts, words in all descriptions, and words in all comments uploaded by politicians of the same gender"""
    if gender=="F":
        name = "female"
    elif gender=="M":
        name = "male"
    print(f"number of words in all descriptions for {name}: ", count(file)[0])
    print(f"number of words in all comments for {name}: ", count(file)[1])

In [22]:
# get a summary for female politicians
print("number of female accounts: ", female.shape[0])
summary("F", female)

# get a summary for male politicians
print("number of male accounts: ", male.shape[0])
summary("M", male)

number of female accounts:  446
number of words in all descriptions for female:  84514
number of words in all comments for female:  1043260
number of male accounts:  956
number of words in all descriptions for male:  181546
number of words in all comments for male:  2105258


### Exploratory analysis per account

In [9]:
def videoCount(directory, key):
    """Given a directory path (i.e. one account), count how many videos there are."""
    accts = {}
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".json"):
                file_path = os.path.join(root, file)
                with open (file_path, 'r') as f:
                    data = json.load(f)
                    count = 0
                    #print("Data from {}: {}".format(file, data))  # Add this line for debugging
                    for item in data:
                         if key in item: 
                             count +=1
                    accts[file] = count
    return accts

#### Female accounts
For each account of a female politician, get the number of videos they've uploaded.

In [16]:
accts = videoCount(dir_path + "output_female", "id")
df_female = pd.DataFrame(list(accts.items()), columns=['File', 'Count'])
df_female

Unnamed: 0,File,Count
0,output_repstansbury.json,35
1,output_nikemawilliams.json,3
2,output_marieforcongress.json,12
3,output_repchrissyhoulahan.json,6
4,output_teampattymurray.json,35
5,output_reppressley.json,28
6,output_sheilaforhouston.json,83
7,output_kirstengillibrand.json,75
8,output_repsummerlee.json,56
9,output_aoc.json,12


In [19]:
total_count = df_female['Count'].sum()
print(total_count) #just double checking
print(df_female.loc[df_female['Count'].idxmax()])
print(df_female.sort_values(by='Count', ascending=False))

446
File     output_sheilaforhouston.json
Count                              83
Name: 6, dtype: object
                              File  Count
6     output_sheilaforhouston.json     83
7    output_kirstengillibrand.json     75
8         output_repsummerlee.json     56
11      output_repkatieporter.json     35
0         output_repstansbury.json     35
4      output_teampattymurray.json     35
12             output_ilhanmn.json     35
5          output_reppressley.json     28
10           output_repwilson.json     20
2     output_marieforcongress.json     12
9                  output_aoc.json     12
13        output_rashidatlaib.json     11
3   output_repchrissyhoulahan.json      6
1       output_nikemawilliams.json      3


#### Male accounts
For each account of a male politician, get the number of videos they've uploaded.

In [18]:
accts = videoCount(dir_path + "output_male", "id")
df_male = pd.DataFrame(list(accts.items()), columns=['File', 'Count'])
df_male

Unnamed: 0,File,Count
0,output_wileynickel.json,35
1,output_corybooker.json,35
2,output_jeffjacksonnc.json,103
3,output_greglandsmanoh.json,73
4,output_kevinmullinforcongress.json,32
5,output_repdeanphillips.json,26
6,output_repres.gerryconnolly.json,4
7,output_robertgarcia.json,34
8,output_jon.json,11
9,output_bernie.json,139


In [20]:
total_count_male = df_male['Count'].sum()
print(total_count_male) #just double checking
print(df_male.loc[df_male['Count'].idxmax()])
print(df_male.sort_values(by='Count', ascending=False))

956
File     output_bernie.json
Count                   139
Name: 9, dtype: object
                                  File  Count
9                   output_bernie.json    139
2            output_jeffjacksonnc.json    103
15           output_colinallredtx.json     99
16     output_congressmancardenas.json     74
3           output_greglandsmanoh.json     73
18           output_johnfetterman.json     65
12            output_billpascrell.json     54
13                output_repcasar.json     43
0              output_wileynickel.json     35
14         output_repmaxwellfrost.json     35
1               output_corybooker.json     35
10               output_repbowman.json     34
7             output_robertgarcia.json     34
4   output_kevinmullinforcongress.json     32
5          output_repdeanphillips.json     26
19         output_stevenahorsford.json     21
20            output_repmarkpocan.json     19
11          output_troyc4congress.json     17
8                      output_jon.json     