### Clustering Video Descriptions with tf-idf
Last modified: 4/30/24 \
Author: Caroline Jung \
Note: this is an archived method

#### Data Cleaning & Setup

In [1]:
import os
import json
import pandas as pd
import numpy as np
import re
from datetime import date
import operator as op

In [2]:
dir_path = '/users/carolinejung/CS315-proj3-group2/' #CHANGE ME!

In [3]:
def clean_description(description):
    """
    Helper function, takes video description and splits into words, removes punctuation, emojis and stop words.
    """
    if pd.isna(description):  
        return [] 
    
    description = re.sub(r'\d+', '', description) # remove numbers from the text
    words = description.split()  # split the description into words
    cleaned_words = [re.sub(r'[^\w\s]', '', word).lower() for word in words] # remove punctuation and emojis, make everything lowercase
    
    # remove stop words and words containing stop hashtags
    #cleaned_words = [word for word in cleaned_words if word not in stop_words and not any(stop_tag in word for stop_tag in stop_hashtags)]

    cleaned_words = [word for word in cleaned_words if word] # remove empty strings
    return cleaned_words


clean_description("hi hi hey there! this is a test!,   . . hello")

['hi', 'hi', 'hey', 'there', 'this', 'is', 'a', 'test', 'hello']

In [4]:
def get_description(gender):
    if gender=="M":
        filepath = "output_male/"
    elif gender=="F":
        filepath = "output_female/"
    else:
        print("Not a valid input.")
    folder_path = os.path.join(dir_path + '/1-data_collection/')
    files = os.listdir(folder_path + filepath)

    all_description = []
    all_ids = []
    for file in files:
        with open(os.path.join(folder_path + filepath + file), 'r') as f:
            account = json.load(f)
        f.close()

        for video in account:
            vid_desc = ""
            try: 
                for parts in video["description"]:
                    vid_desc += parts.lower()
                all_description.append(vid_desc)
                all_ids.append(video["id"])
            except:
                pass

    cleaned = [clean_description(desc) for desc in all_description]
    data = pd.DataFrame()
    data["video_id"] = all_ids
    data["description_string"] = all_description
    data["description_list"] = cleaned

    today = str(date.today()).replace('-','_')
    data.to_csv(f"{gender}_all_vid_descriptions_{today}.csv", index=False)
    return data

df_M = get_description("M")
df_F = get_description("F")

In [5]:
get_description("F")
get_description("M")

Unnamed: 0,video_id,description_string,description_list
0,7359310165461568798,do you know what drives me nuts about what’s g...,"[do, you, know, what, drives, me, nuts, about,..."
1,7355557230193773866,totalof my heart ❤️👨‍👩‍👧‍👦,"[totalof, my, heart]"
2,7345476833854508319,icymi: i dive into the momentwalked into thean...,"[icymi, i, dive, into, the, momentwalked, into..."
3,7345202286848642347,re: joe biden’s stutter,"[re, joe, bidens, stutter]"
4,7344046721384582431,anmessage from my mom.mom was one of the first...,"[anmessage, from, my, mommom, was, one, of, th..."
...,...,...,...
951,7308890266239552814,🚨teaser alert🚨: medicare advantage is one of t...,"[teaser, alert, medicare, advantage, is, one, ..."
952,7306649030644206890,step 1: strengthen medicare. step 2: medicare ...,"[step, strengthen, medicare, step, medicare, f..."
953,7302143436851809578,the topic of medicare advantage is deeply pers...,"[the, topic, of, medicare, advantage, is, deep..."
954,7301454548730711338,question: what would you call the ex-president...,"[question, what, would, you, call, the, expres..."


In [6]:
# get all words & convert them into unique word list
def unique_words(gender):
    df = get_description(gender)
    unique = set()
    df["description_list"].apply(unique.update)
    return list(unique)

sorted(unique_words("F"))

['a',
 'ability',
 'able',
 'aboah',
 'abortion',
 'about',
 'above',
 'abq',
 'abuse',
 'access',
 'accessibility',
 'accessible',
 'accomplishment',
 'account',
 'accountability',
 'accountable',
 'accountit',
 'accurate',
 'achieving',
 'across',
 'act',
 'action',
 'actions',
 'actively',
 'activism',
 'activist',
 'actors',
 'actscotus',
 'actually',
 'ada',
 'add',
 'addiction',
 'addition',
 'address',
 'addresses',
 'addressing',
 'adjourn',
 'admin',
 'ads',
 'advance',
 'advancing',
 'advisors',
 'advocates',
 'advocating',
 'adwoa',
 'affect',
 'affected',
 'affecting',
 'affirmative',
 'afford',
 'affordable',
 'afor',
 'afraid',
 'after',
 'again',
 'against',
 'aganist',
 'agencies',
 'agenda',
 'ago',
 'agorefused',
 'agree',
 'agreements',
 'ahead',
 'aid',
 'ailment',
 'aimed',
 'aims',
 'air',
 'airport',
 'alarayshi',
 'alaskan',
 'albuquerque',
 'alentines',
 'alexis',
 'alito',
 'alive',
 'all',
 'allegheny',
 'allen',
 'allow',
 'almost',
 'alone',
 'along',
 'alo

#### Bag of words

In [7]:
import string

In [8]:
def text2vector(sentence, voc):
    """Given a sentence and the vocabulary for the problem,
    turn every sentence into a vector.
    """
    cleantext = "".join(char for char in sentence if char not in string.punctuation)
    words = cleantext.lower().split()
    vector = [words.count(w) for w in voc]
    return vector

def bag_of_words(gender):
    sentences = get_description(gender)["description_string"]
    voc = unique_words(gender)
    sent2vec = [text2vector(sent, voc) for sent in sentences]
    df = pd.DataFrame(sent2vec, 
                  columns=voc,
                  index=[f"doc_{i+1}" for i in range(len(sentences))])
    return df

# bag_of_words("F").to_csv("test.csv")

### Calculate tf-idf

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# visualizing
tfidfvectorizer = TfidfVectorizer()
sentences = get_description("F")["description_string"]
X = tfidfvectorizer.fit_transform(sentences)
pd.DataFrame(X.toarray(), columns=tfidfvectorizer.get_feature_names_out())
# for entire description text

Unnamed: 0,000,013,100,11,118th,122,12th,14,142,1992,...,ymca,york,you,young,your,yourself,youth,zakaria,zusic,zydeco
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
442,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.328422,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.279634,0.0,0.0,0.0,0.0,0.0
444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.112038,0.0,0.134646,0.0,0.0,0.0,0.0,0.0


### Clustering

In [11]:
from sklearn.cluster import KMeans

In [12]:
def preprocessing(line):
    line = line.lower()
    line = re.sub(r"[{}]".format(string.punctuation), " ", line)
    return line

def create_clusters(gender, k):
    sentences = get_description(gender)["description_string"]
    tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessing)
    tfidf = tfidf_vectorizer.fit_transform(sentences)

    kmeans = KMeans(n_clusters=k).fit(tfidf)
    clusters = kmeans.predict(tfidf_vectorizer.transform(sentences))

    final_clusters = {}
    for i in range(k):
        data = sentences
        cluster_words = [data[j] for j in range(len(data)) if clusters[j]==i]
        final_clusters[f"cluster{i}"] = cluster_words
    return final_clusters

In [15]:
create_clusters("F", 6)
#create_clusters("M", 6)
# clusters do not look very distinct upon manual observation

{'cluster0': ['drum roll please! introducing our guest for the state of the union: dr. destinie marquez! dr. marquez is an obgyn at unm who focuses on reproductive rights, addressing social disparities affecting patient health, and advocating for her patients at a legislative level for better healthcare.',
  "it's been three years since trump and maga extremists led an insurrection at the capitol on jan 6. at every turn, trump has tried to undermine and overturn our democracy for personal gain.",
  'have you voted??? atlanta mayorhas a message for you.is tuesday, may 24th. what’s your voting plan?',
  'ballots are arriving in mailboxes, and if you’re as fired up as i am about protecting mail-in voting then join me tomorrow at esther short park – sign up here: https://mobilize.us/s/dk6ro3',
  'so much fun at thr skamania county parade and fair yesterday!',
  'i just dropped off my ballot for the november 2 election! make your plan to vote today at vote.pa.gov 🗳',
  'democracy is on the 

In [14]:
def freq_words(gender, k):
    """Calculates the most frequent terms in each cluster"""
    final_clusters = create_clusters(gender, k)
    result = []
    for cluster in final_clusters:
        descriptions = final_clusters[cluster]
        all_words = ""
        for video in descriptions:
            for word in clean_description(video):
                all_words += word + " "
    
        res = {key: op.countOf(all_words.split(), key) for key in all_words.split()}
        final = sorted(res.items(), key=lambda x:x[1], reverse=True)
        result.append(final)
    return result

freq_words("F", 6)

[[('the', 4),
  ('and', 2),
  ('people', 2),
  ('of', 2),
  ('i', 1),
  ('witnessed', 1),
  ('havoc', 1),
  ('us', 1),
  ('bullets', 1),
  ('bombs', 1),
  ('waged', 1),
  ('on', 1),
  ('iraq', 1),
  ('these', 1),
  ('are', 1),
  ('actions', 1),
  ('being', 1),
  ('perpetrated', 1),
  ('now', 1),
  ('against', 1),
  ('gazaagainst', 1),
  ('innocent', 1),
  ('civilians', 1),
  ('children', 1)],
 [('the', 394),
  ('to', 341),
  ('and', 226),
  ('of', 207),
  ('a', 182),
  ('in', 157),
  ('for', 146),
  ('we', 128),
  ('is', 128),
  ('our', 127),
  ('this', 92),
  ('are', 84),
  ('on', 80),
  ('i', 75),
  ('you', 70),
  ('that', 69),
  ('with', 67),
  ('have', 51),
  ('it', 48),
  ('about', 47),
  ('their', 44),
  ('be', 43),
  ('will', 42),
  ('your', 42),
  ('my', 41),
  ('from', 41),
  ('at', 37),
  ('not', 36),
  ('right', 34),
  ('its', 34),
  ('up', 33),
  ('all', 32),
  ('they', 32),
  ('as', 31),
  ('people', 30),
  ('who', 29),
  ('what', 28),
  ('now', 28),
  ('congress', 28),
  