In [2]:
import json
import os
import re
from collections import Counter

import numpy as np
import pandas as pd

In [3]:
# Root folder of the collected data. get_description() appends "output_male/"
# or "output_female/" to this value and reads every file found there.
# NOTE: this is an absolute, machine-specific path -- edit it before running
# the notebook on another machine.
dir_path = "/users/carolinejung/CS315-proj3-group2/1-data_collection/" #CHANGE ME!

In [4]:
def clean_description(description):
    """
    Tokenize a video description into a list of lowercase words.

    Digits are stripped first, the text is split on whitespace, and every
    character that is neither a word character nor whitespace (punctuation,
    emojis, ...) is removed from each token. Tokens left empty by that
    cleanup are discarded. Stop words are NOT filtered out here.

    Returns an empty list when `description` is missing (None / NaN).
    """
    if pd.isna(description):
        return []

    # drop digits before tokenizing so e.g. "8pm" becomes "pm"
    digitless = re.sub(r'\d+', '', description)

    tokens = []
    for raw_token in digitless.split():
        # strip punctuation/emojis, then normalize case
        token = re.sub(r'[^\w\s]', '', raw_token).lower()
        if token:
            tokens.append(token)
    return tokens

# Sanity check: repeated words are kept, while punctuation-only tokens and
# extra whitespace are dropped.
clean_description("hi hi hadf! this is a test!,   . . hello")

['hi', 'hi', 'hadf', 'this', 'is', 'a', 'test', 'hello']

In [5]:
def get_description(gender):
    """
    Load every video description for one group of accounts.

    Parameters
    ----------
    gender : str
        "M" reads the files under ``dir_path + "output_male/"``,
        "F" reads the files under ``dir_path + "output_female/"``.

    Returns
    -------
    pandas.DataFrame
        One row per video with two columns:
        ``description_string`` (the lowercased description text) and
        ``description_list`` (its tokens, as produced by clean_description).

    Raises
    ------
    ValueError
        If `gender` is neither "M" nor "F".
    """
    subfolders = {"M": "output_male/", "F": "output_female/"}
    if gender not in subfolders:
        # Fail early with a clear error instead of printing and then crashing
        # later on an undefined path variable.
        raise ValueError(f"Not a valid input: {gender!r} (expected 'M' or 'F').")
    folder = os.path.join(dir_path, subfolders[gender])

    all_description = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the resulting frame stable between runs.
    for file in sorted(os.listdir(folder)):
        # the `with` block closes the file; JSON text is read as UTF-8
        with open(os.path.join(folder, file), 'r', encoding="utf-8") as f:
            account = json.load(f)

        for video in account:
            try:
                # "description" is iterated piece by piece and the lowercased
                # pieces are concatenated into a single string
                vid_desc = "".join(part.lower() for part in video["description"])
            except (KeyError, TypeError, AttributeError):
                # best effort: skip videos with a missing or malformed description
                continue
            all_description.append(vid_desc)

    cleaned = [clean_description(desc) for desc in all_description]
    return pd.DataFrame({
        "description_string": all_description,
        "description_list": cleaned,
    })

# Load each group's descriptions once and keep the frames for later cells.
df_M = get_description("M")
df_F = get_description("F")

# Display the frame loaded above instead of re-reading every file from disk.
df_F

Unnamed: 0,description_string,description_list
0,we all know who the grand puppet master is in ...,"[we, all, know, who, the, grand, puppet, maste..."
1,if the majority wants to talk about dark money...,"[if, the, majority, wants, to, talk, about, da..."
2,"the party of ‘law and order” 🤔lol, they litera...","[the, party, of, law, and, order, lol, they, l..."
3,no one should have the right to tell women wha...,"[no, one, should, have, the, right, to, tell, ..."
4,i ran for congress because i was sick of polit...,"[i, ran, for, congress, because, i, was, sick,..."
...,...,...
441,we vote within,"[we, vote, within]"
442,as long as you’re in line by 8pm the polls mus...,"[as, long, as, youre, in, line, by, pm, the, p..."
443,be like yama and make your voice heard!,"[be, like, yama, and, make, your, voice, heard]"
444,"don’t mail your ballot, drop it off at a dropb...","[dont, mail, your, ballot, drop, it, off, at, ..."


In [15]:
# Collect the cleaned words of one group and reduce them to distinct words.
def unique_words(gender):
    """
    Return the distinct words found in the `description_list` column of
    get_description(gender), as a list (in set iteration order, i.e. unsorted).
    """
    vocabulary = set()
    for tokens in get_description(gender)["description_list"]:
        vocabulary.update(tokens)
    return list(vocabulary)

#sorted(unique_words("F"))

### Bag of words

In [7]:
import string

In [8]:
def text2vector(sentence, voc):
    """Turn a sentence into a vector of word counts over a vocabulary.

    ASCII punctuation (``string.punctuation``) is removed, the text is
    lowercased and split on whitespace; the result holds, for each entry of
    `voc` (in order), how many times that word occurs in the sentence.

    Parameters
    ----------
    sentence : str
        Text to vectorize.
    voc : sequence of str
        Vocabulary; one output position per entry.

    Returns
    -------
    list of int
        Counts aligned with `voc` (0 for words absent from the sentence).
    """
    cleantext = "".join(char for char in sentence if char not in string.punctuation)
    # Count every word once up front instead of rescanning the word list for
    # each vocabulary entry.
    counts = Counter(cleantext.lower().split())
    return [counts[w] for w in voc]

def bag_of_words(gender):
    """
    Build a bag-of-words count matrix for one group of accounts.

    Parameters
    ----------
    gender : str
        Group code forwarded to get_description() and unique_words().

    Returns
    -------
    pandas.DataFrame
        One row per description (index ``doc_1`` ... ``doc_N``) and one
        column per vocabulary word; each cell is the count produced by
        text2vector for that description.

    Notes
    -----
    The vocabulary comes from clean_description (regex-based, digits removed)
    while the counts come from text2vector (ASCII ``string.punctuation`` only),
    so words next to non-ASCII punctuation or digits may not line up.
    """
    # Use the requested group for both the sentences and the vocabulary
    # (previously "F" was hard-coded and `gender` was ignored).
    sentences = get_description(gender)["description_string"]
    voc = unique_words(gender)
    sent2vec = [text2vector(sent, voc) for sent in sentences]
    df = pd.DataFrame(sent2vec,
                      columns=voc,
                      index=[f"doc_{i+1}" for i in range(len(sentences))])
    return df

#bag_of_words("F").to_csv("test.csv")

### Calculate tf-idf

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# visualizing
tfidfvectorizer = TfidfVectorizer()
sentences = get_description("F")["description_string"]
X = tfidfvectorizer.fit_transform(sentences)
pd.DataFrame(X.toarray(), columns=tfidfvectorizer.get_feature_names_out())
# for entire description text

Unnamed: 0,000,013,100,11,118th,122,12th,14,142,1992,...,ymca,york,you,young,your,yourself,youth,zakaria,zusic,zydeco
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
442,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.328422,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.279634,0.0,0.0,0.0,0.0,0.0
444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.112038,0.0,0.134646,0.0,0.0,0.0,0.0,0.0


### Clustering

In [11]:
from sklearn.cluster import KMeans

In [20]:
def preprocessing(line):
    """
    Lowercase `line` and replace every ASCII punctuation character
    (``string.punctuation``) with a single space.

    Used as the `preprocessor` callable of TfidfVectorizer in create_clusters.
    """
    line = line.lower()
    # re.escape keeps characters such as "\", "]" and "^" literal inside the
    # character class; unescaped, the "\]" pair is read as an escaped bracket
    # and the backslash itself is never matched.
    line = re.sub(r"[{}]".format(re.escape(string.punctuation)), " ", line)
    return line

def create_clusters(gender, k, random_state=None):
    """
    Cluster the descriptions of one group with k-means on TF-IDF features.

    Parameters
    ----------
    gender : str
        Group code forwarded to get_description().
    k : int
        Number of clusters.
    random_state : int or None, optional
        Seed passed to KMeans. The default (None) keeps the previous
        unseeded behaviour; pass an integer for reproducible clusters.

    Returns
    -------
    dict
        Keys ``"cluster0"`` ... ``"cluster{k-1}"``; each value is the list of
        description strings assigned to that cluster, in their original order.
    """
    sentences = get_description(gender)["description_string"]
    tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessing)
    tfidf = tfidf_vectorizer.fit_transform(sentences)

    # fit_predict fits the model and labels the same matrix in one step, so
    # the sentences do not have to be vectorized a second time.
    labels = KMeans(n_clusters=k, random_state=random_state).fit_predict(tfidf)

    # Create every bucket up front so empty clusters still appear in the result.
    final_clusters = {f"cluster{i}": [] for i in range(k)}
    for sentence, label in zip(sentences, labels):
        final_clusters[f"cluster{label}"].append(sentence)
    return final_clusters

# Group the "F" descriptions into 6 k-means clusters and display the whole
# dict (cluster name -> list of description strings).
create_clusters("F", 6)

{'cluster0': ['drum roll please! introducing our guest for the state of the union: dr. destinie marquez! dr. marquez is an obgyn at unm who focuses on reproductive rights, addressing social disparities affecting patient health, and advocating for her patients at a legislative level for better healthcare.',
  "it's been three years since trump and maga extremists led an insurrection at the capitol on jan 6. at every turn, trump has tried to undermine and overturn our democracy for personal gain.",
  'so much fun at thr skamania county parade and fair yesterday!',
  'think a maga republican can’t win in washington state? think again. there’s too much at stake to sit this year out.',
  'happy birthday, president obama!',
  '',
  '',
  '',
  '',
  'thank you mr. president 🙏🏾',
  'yikes.',
  '🗣️ mayor turner',
  '🍔🍟',
  'thank you city wide club super feast 💪🏾💪🏾',
  '💃🏾🕺🏾',
  '❤️',
  'houston, let’s do this.',
  'city hall must always reflect our values.',
  'thank you, beto 😎',
  'i *reall

In [19]:
# calculate most frequent terms in each cluster? - consider wildcards
import operator as op

def freq_words(gender, k, top_n=None, stop_words=None):
    """
    Count word frequencies inside each k-means cluster of descriptions.

    Parameters
    ----------
    gender : str
        Group code forwarded to create_clusters().
    k : int
        Number of clusters.
    top_n : int or None, optional
        Keep only the `top_n` most frequent words per cluster.
        None (default) keeps every word.
    stop_words : iterable of str or None, optional
        Words to leave out of the counts. None (default) filters nothing.

    Returns
    -------
    list of list of (str, int)
        One entry per cluster, in cluster order; each entry lists
        ``(word, count)`` pairs sorted by descending count, with ties kept in
        order of first appearance.
    """
    excluded = frozenset(stop_words or ())
    final_clusters = create_clusters(gender, k)

    result = []
    for descriptions in final_clusters.values():
        # A single counting pass over all words of the cluster.
        counts = Counter(
            word
            for video in descriptions
            for word in clean_description(video)
            if word not in excluded
        )
        result.append(counts.most_common(top_n))

    return result

# Word counts per cluster for the "F" descriptions (6 clusters), most frequent
# words first. Stop words are not filtered in this call.
freq_words("F", 6)

[[('the', 294),
  ('of', 168),
  ('to', 149),
  ('and', 132),
  ('a', 96),
  ('in', 91),
  ('our', 73),
  ('for', 69),
  ('is', 59),
  ('on', 52),
  ('that', 52),
  ('we', 51),
  ('i', 48),
  ('are', 46),
  ('this', 39),
  ('with', 38),
  ('people', 27),
  ('up', 23),
  ('my', 22),
  ('it', 22),
  ('its', 21),
  ('be', 21),
  ('congress', 20),
  ('at', 20),
  ('as', 19),
  ('you', 19),
  ('all', 18),
  ('from', 18),
  ('their', 18),
  ('who', 17),
  ('not', 17),
  ('republicans', 16),
  ('about', 16),
  ('has', 16),
  ('just', 15),
  ('have', 14),
  ('us', 14),
  ('out', 14),
  ('justice', 14),
  ('can', 13),
  ('here', 13),
  ('one', 13),
  ('they', 12),
  ('them', 12),
  ('now', 12),
  ('some', 12),
  ('new', 12),
  ('how', 12),
  ('make', 12),
  ('week', 12),
  ('an', 12),
  ('black', 12),
  ('when', 12),
  ('student', 12),
  ('but', 11),
  ('because', 11),
  ('time', 11),
  ('every', 11),
  ('into', 11),
  ('by', 11),
  ('me', 11),
  ('will', 11),
  ('most', 11),
  ('his', 10),
  (