In [2]:
import os
import json
import pandas as pd
import numpy as np
import re
from datetime import date

In [3]:
dir_path = '/Users/tayaerogers/Documents/MEDSL/GitHub/CS315-proj3-group2/' #CHANGE ME!

In [4]:
def clean_description(description):
    """
    Helper function, takes video description and splits into words, removes punctuation, emojis and stop words.
    """
    if pd.isna(description):  
        return [] 
    
    # remove numbers from the text
    description = re.sub(r'\d+', '', description)

    # split the description into words
    words = description.split()
    
    # remove punctuation and emojis, make everything lowercase
    cleaned_words = [re.sub(r'[^\w\s]', '', word).lower() for word in words]
    #return cleaned_words
    
    # remove stop words and words containing stop hashtags
    #cleaned_words = [word for word in cleaned_words if word not in stop_words and not any(stop_tag in word for stop_tag in stop_hashtags)]

    # remove empty strings
    cleaned_words = [word for word in cleaned_words if word]
    return cleaned_words
    
    # end_string = ""
    # for word in cleaned_words:
    #     if word:
    #         end_string += word + " "
    # return end_string[:-1]

clean_description("hi hi hadf! this is a test!,   . . hello")

['hi', 'hi', 'hadf', 'this', 'is', 'a', 'test', 'hello']

In [6]:
def get_description(gender):
    if gender=="M":
        filepath = "output_male/"
    elif gender=="F":
        filepath = "output_female/"
    else:
        print("Not a valid input.")
    folder_path = os.path.join(dir_path + '/1-data_collection/')
    files = os.listdir(folder_path + filepath)

    all_description = []
    all_ids = []
    for file in files:
        with open(os.path.join(folder_path + filepath + file), 'r') as f:
            account = json.load(f)
        f.close()

        for video in account:
            vid_desc = ""
            try: 
                for parts in video["description"]:
                    vid_desc += parts.lower()
                all_description.append(vid_desc)
                all_ids.append(video["id"])
        
            except:
                pass

    cleaned = [clean_description(desc) for desc in all_description]
    data = pd.DataFrame()
    data["video_id"] = all_ids
    data["description_string"] = all_description
    data["description_list"] = cleaned

    today = str(date.today()).replace('-','_')
    data.to_csv(f"{gender}_all_vid_descriptions_{today}.csv", index=False)

    return data

df_M = get_description("M")
df_F = get_description("F")

get_description("F")

Unnamed: 0,video_id,description_string,description_list
0,7283952208561933599,we all know who the grand puppet master is in ...,"[we, all, know, who, the, grand, puppet, maste..."
1,7278392468788366634,if the majority wants to talk about dark money...,"[if, the, majority, wants, to, talk, about, da..."
2,7257715712897125675,"the party of ‚Äòlaw and order‚Äù ü§îlol, they litera...","[the, party, of, law, and, order, lol, they, l..."
3,7357176379433094442,no one should have the right to tell women wha...,"[no, one, should, have, the, right, to, tell, ..."
4,7356391987936873771,i ran for congress because i was sick of polit...,"[i, ran, for, congress, because, i, was, sick,..."
...,...,...,...
441,7163807350543928619,we vote within,"[we, vote, within]"
442,7163802367308074282,as long as you‚Äôre in line by 8pm the polls mus...,"[as, long, as, youre, in, line, by, pm, the, p..."
443,7163458954188623150,be like yama and make your voice heard!,"[be, like, yama, and, make, your, voice, heard]"
444,7162576451697511726,"don‚Äôt mail your ballot, drop it off at a dropb...","[dont, mail, your, ballot, drop, it, off, at, ..."


In [6]:
# get all words & convert them into unique word list
def unique_words(gender):
    df = get_description(gender)
    unique = set()
    df["description_list"].apply(unique.update)
    return list(unique)

sorted(unique_words("F"))

['a',
 'ability',
 'able',
 'aboah',
 'abortion',
 'about',
 'above',
 'abq',
 'abuse',
 'access',
 'accessibility',
 'accessible',
 'accomplishment',
 'account',
 'accountability',
 'accountable',
 'accountit',
 'accurate',
 'achieving',
 'across',
 'act',
 'action',
 'actions',
 'actively',
 'activism',
 'activist',
 'actors',
 'actscotus',
 'actually',
 'ada',
 'add',
 'addiction',
 'addition',
 'address',
 'addresses',
 'addressing',
 'adjourn',
 'admin',
 'ads',
 'advance',
 'advancing',
 'advisors',
 'advocates',
 'advocating',
 'adwoa',
 'affect',
 'affected',
 'affecting',
 'affirmative',
 'afford',
 'affordable',
 'afor',
 'afraid',
 'after',
 'again',
 'against',
 'aganist',
 'agencies',
 'agenda',
 'ago',
 'agorefused',
 'agree',
 'agreements',
 'ahead',
 'aid',
 'ailment',
 'aimed',
 'aims',
 'air',
 'airport',
 'alarayshi',
 'alaskan',
 'albuquerque',
 'alentines',
 'alexis',
 'alito',
 'alive',
 'all',
 'allegheny',
 'allen',
 'allow',
 'almost',
 'alone',
 'along',
 'alo

#### Bag of words

In [7]:
import string

In [8]:
def text2vector(sentence, voc):
    """Given a sentence and the vocabulary for the problem,
    turn every sentence into a vector.
    """
    cleantext = "".join(char for char in sentence if char not in string.punctuation)
    words = cleantext.lower().split()
    vector = [words.count(w) for w in voc]
    return vector

def bag_of_words(gender):
    sentences = get_description("F")["description_string"]
    voc = unique_words("F")
    sent2vec = [text2vector(sent, voc) for sent in sentences]
    df = pd.DataFrame(sent2vec, 
                  columns=voc,
                  index=[f"doc_{i+1}" for i in range(len(sentences))])
    return df

#bag_of_words("F").to_csv("test.csv")

### Calculate tf-idf

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# visualizing
tfidfvectorizer = TfidfVectorizer()
sentences = get_description("F")["description_string"]
X = tfidfvectorizer.fit_transform(sentences)
pd.DataFrame(X.toarray(), columns=tfidfvectorizer.get_feature_names_out())
# for entire description text

Unnamed: 0,000,013,100,11,118th,122,12th,14,142,1992,...,ymca,york,you,young,your,yourself,youth,zakaria,zusic,zydeco
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
442,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.328422,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.279634,0.0,0.0,0.0,0.0,0.0
444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.112038,0.0,0.134646,0.0,0.0,0.0,0.0,0.0


### Clustering

In [11]:
from sklearn.cluster import KMeans

In [12]:
def preprocessing(line):
    line = line.lower()
    line = re.sub(r"[{}]".format(string.punctuation), " ", line)
    return line

def create_clusters(gender, k):
    sentences = get_description(gender)["description_string"]
    tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessing)
    tfidf = tfidf_vectorizer.fit_transform(sentences)

    kmeans = KMeans(n_clusters=k).fit(tfidf)
    clusters = kmeans.predict(tfidf_vectorizer.transform(sentences))

    final_clusters = {}
    for i in range(k):
        data = sentences
        cluster_words = [data[j] for j in range(len(data)) if clusters[j]==i]
        final_clusters[f"cluster{i}"] = cluster_words
    return final_clusters
create_clusters("M", 6)

{'cluster0': ['house republicans had a huge loss this week‚ÄºÔ∏èhow women‚Äôs reproductive health rights and bipartisan immigration reform won the day in. üá∫üá∏',
  'house republicans had a big upset this week. why? democrats stayed together.an amazing story about how al green helped us win a bs impeachment vote.',
  'i was glad to join lawrence o‚Äôdonnell again last night to talk about what this win means for the voters of north carolina. if republicans attack our freedoms, voters will reject them at the ballot box. watch here ‚û°Ô∏è',
  'bo? hell no!',
  'üëÄ how much did bo hines pay to be the ‚Äúpresenting sponsor‚Äù of this hate-filled speech by his good friend madison cawthorn?',
  'hmmm ü§î üßê',
  'north carolina voters deserve a representative that will fight to fix their problems, not take away their rights.bill, a johnston county veteran, puts it perfectly in our latest ad: bo? hell no!',
  'why wiley?',
  'an eclipse is rare.human kindness and service are not.',
  'p

In [14]:
# calculate most frequent terms in each cluster? - consider wildcards
import operator as op

def freq_words(gender, k):
    final_clusters = create_clusters(gender, k)
    result = []
    for cluster in final_clusters:
        descriptions = final_clusters[cluster]
        all_words = ""
        for video in descriptions:
            for word in clean_description(video):
                all_words += word + " "
        
        # TO DO: need to filter out stop words
        res = {key: op.countOf(all_words.split(), key) for key in all_words.split()}
        final = sorted(res.items(), key=lambda x:x[1], reverse=True)
        result.append(final)

        # TO DO: store in dictionary that stores which clusters & only get the top ~10 words
        
    return result

freq_words("F", 6)

[[('the', 4),
  ('and', 2),
  ('people', 2),
  ('of', 2),
  ('i', 1),
  ('witnessed', 1),
  ('havoc', 1),
  ('us', 1),
  ('bullets', 1),
  ('bombs', 1),
  ('waged', 1),
  ('on', 1),
  ('iraq', 1),
  ('these', 1),
  ('are', 1),
  ('actions', 1),
  ('being', 1),
  ('perpetrated', 1),
  ('now', 1),
  ('against', 1),
  ('gazaagainst', 1),
  ('innocent', 1),
  ('civilians', 1),
  ('children', 1)],
 [('the', 394),
  ('to', 341),
  ('and', 226),
  ('of', 207),
  ('a', 182),
  ('in', 157),
  ('for', 146),
  ('we', 128),
  ('is', 128),
  ('our', 127),
  ('this', 92),
  ('are', 84),
  ('on', 80),
  ('i', 75),
  ('you', 70),
  ('that', 69),
  ('with', 67),
  ('have', 51),
  ('it', 48),
  ('about', 47),
  ('their', 44),
  ('be', 43),
  ('will', 42),
  ('your', 42),
  ('my', 41),
  ('from', 41),
  ('at', 37),
  ('not', 36),
  ('right', 34),
  ('its', 34),
  ('up', 33),
  ('all', 32),
  ('they', 32),
  ('as', 31),
  ('people', 30),
  ('who', 29),
  ('what', 28),
  ('now', 28),
  ('congress', 28),
  