In [2]:
!pip install -r requirements.txt
import pandas as pd

df = pd.read_csv('content.csv')

In [6]:
import nltk
# Fetch the NLTK 'stopwords' and 'punkt' resources before they are needed.
# NOTE(review): presumably required by Rake and by word_tokenize used later
# in this notebook -- confirm against the installed library versions.
nltk.download('stopwords')
nltk.download('punkt')
from rake_nltk import Rake
from msticpy.data import data_obfus

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gr8monk3ys/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gr8monk3ys/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# preview the first five rows of the raw frame
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,author,timestamp
0,0,Another step closer to ruining the internet fo...,creggor,2024-02-13T21:40:30.886000+0000
1,1,Chinese researchers develop calcium-based batt...,chrisdh79,2024-02-13T12:04:27.407000+0000
2,2,Replacement cartilage can grow in any shape wi...,chrisdh79,2024-02-13T21:07:53.955000+0000
3,3,OpenAI CEO warns that 'societal misalignments'...,JackFisherBooks,2024-02-13T20:21:39.876000+0000
4,4,"28-ton, 1.2-megawatt tidal kite is now exporti...",chrisdh79,2024-02-12T13:05:41.010000+0000


In [8]:
def process(raw_df, min_score=4, max_phrases=5, dedupe_on=None):
    '''
    Wendy & Lorenzo's part
    Clean the scraped posts and add keyword and masked-author columns.

    input: the dataframe returned by the scrape() function; it must contain
           a 'title' and an 'author' column
    output: the processed dataframe that is about to be stored in db. It is
            a new frame (raw_df is not modified) with two extra columns:
              'keywords'    - comma-joined key phrases extracted from 'title'
              'masked user' - data_obfus.hash_account() applied to 'author'

    min_score:   keep only phrases whose RAKE score is >= this value
    max_phrases: keep at most this many phrases per title
    dedupe_on:   optional column label(s) passed to drop_duplicates(subset=...);
                 None (default) compares whole rows
    '''
    # .copy() gives an independent frame, so the column assignments below
    # never write into a view of raw_df (avoids SettingWithCopyWarning).
    processed_df = raw_df.drop_duplicates(subset=dedupe_on).copy()
    # Not currently applied:
    #processed_df['timestamp'] = pd.to_datetime(processed_df['timestamp'], unit='s')
    #processed_df['domain'] = processed_df['title'].str.extract(r'\((.*?)\)')
    #processed_df.drop(columns=['title'], inplace=True)
    #processed_df.rename(columns={'author': 'post_author', 'timestamp': 'post_timestamp'}, inplace=True)

    # One extractor is reused for all titles; building it inside the loop
    # repeated the same setup work for every row.
    rake = Rake()

    keyword_strings = []
    for title in processed_df['title']:
        if not isinstance(title, str):
            # Missing / non-text title: nothing to extract.
            keyword_strings.append('')
            continue
        rake.extract_keywords_from_text(title)
        # (score, phrase) pairs as returned by the extractor
        scored_phrases = rake.get_ranked_phrases_with_scores()
        top_phrases = [phrase for score, phrase in scored_phrases
                       if score >= min_score][:max_phrases]
        # stored as a single comma-joined string so that it can be saved in db
        keyword_strings.append(','.join(map(str, top_phrases)))

    processed_df['keywords'] = keyword_strings

    # replace each author name with its obfuscated account id
    processed_df['masked user'] = [data_obfus.hash_account(username)
                                   for username in processed_df['author']]

    return processed_df

In [9]:
# run the cleaning / keyword / masking step on the loaded frame and preview it
processed_df = process(raw_df=df)
processed_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,author,timestamp,keywords,masked user
0,0,Another step closer to ruining the internet fo...,creggor,2024-02-13T21:40:30.886000+0000,another step closer,account-#11468
1,1,Chinese researchers develop calcium-based batt...,chrisdh79,2024-02-13T12:04:27.407000+0000,"chinese researchers develop calcium,lasts 700 ...",account-#13984
2,2,Replacement cartilage can grow in any shape wi...,chrisdh79,2024-02-13T21:07:53.955000+0000,replacement cartilage,account-#13984
3,3,OpenAI CEO warns that 'societal misalignments'...,JackFisherBooks,2024-02-13T20:21:39.876000+0000,"could make artificial intelligence dangerous,o...",account-#16919
4,4,"28-ton, 1.2-megawatt tidal kite is now exporti...",chrisdh79,2024-02-12T13:05:41.010000+0000,"megawatt tidal kite,exporting power",account-#13984


In [10]:
# turn each comma-joined keyword string back into a list of phrases
list_of_keywords = processed_df['keywords'].map(lambda joined: joined.split(','))

In [11]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import string
from nltk.tokenize import word_tokenize

ModuleNotFoundError: No module named 'gensim'

In [21]:
# Doc2Vec configured for distributed memory (dm=1)
model_dm = Doc2Vec(
    dm=1,
    vector_size=25,
    min_count=2,
    epochs=30,
)

In [27]:
# remove punctuation
# The deletion table is built once instead of once per title, and the
# always-true filter has been dropped.
punct_table = str.maketrans('', '', string.punctuation)
title_text = [title.translate(punct_table) for title in processed_df['title']]

In [28]:
# make all words lowercased
title_text_lower = list(map(str.lower, title_text))

In [32]:
# one TaggedDocument per title, tagged with its position (as a string)
tagged_data = [
    TaggedDocument(words=word_tokenize(text.lower()), tags=[str(position)])
    for position, text in enumerate(title_text_lower)
]

In [34]:
# Build the model's vocabulary from the tagged titles.
model_dm.build_vocab(tagged_data)

In [36]:
# Train on the tagged titles, using the corpus size stored on the model and
# the epoch count the model was constructed with.
model_dm.train(tagged_data,
            total_examples=model_dm.corpus_count,
            epochs=model_dm.epochs)

In [37]:
# infer one vector per cleaned, lowercased title with the trained model
document_vectors = [
    model_dm.infer_vector(word_tokenize(text.lower()))
    for text in title_text_lower
]

In [39]:
# one row per post: original title, keyword list, inferred vector
text_vec_df = pd.DataFrame(
    {
        'title': processed_df['title'],
        'keywords': list_of_keywords,
        'vectors': document_vectors,
    }
)

In [40]:
# preview the first five rows of the title / keywords / vectors frame
text_vec_df.head(n=5)

Unnamed: 0,title,keywords,vectors
0,Another step closer to ruining the internet fo...,[another step closer],"[-0.03415885, -0.14972238, 0.008515159, -0.168..."
1,Chinese researchers develop calcium-based batt...,"[chinese researchers develop calcium, lasts 70...","[-0.093232565, -0.3394914, -0.012083201, -0.40..."
2,Replacement cartilage can grow in any shape wi...,[replacement cartilage],"[-0.016030788, -0.10607375, 0.0027287838, -0.1..."
3,OpenAI CEO warns that 'societal misalignments'...,"[could make artificial intelligence dangerous,...","[-0.053585082, -0.22113702, -0.017498622, -0.2..."
4,"28-ton, 1.2-megawatt tidal kite is now exporti...","[megawatt tidal kite, exporting power]","[-0.003168633, -0.068083376, -0.01440842, -0.0..."


In [41]:
def document_vector(df, vector_size=25, min_count=2, epochs=30):
    '''
    Train a Doc2Vec model on the post titles and return one vector per title.

    input df produced by the process function (pulled from db version); it
    must contain a 'title' column and a 'keywords' column holding either a
    comma-joined string or a list of phrases
    returns a df with 3 columns: reddit post title, list of keyword extracted
    from title, and document vector corresponding to title

    vector_size, min_count, epochs: forwarded to the Doc2Vec constructor
    '''
    def _as_keyword_list(value):
        # Keywords are taken from the frame that was passed in, not from a
        # notebook-level variable, so the rows always line up with `df`.
        if isinstance(value, str):
            return value.split(',')
        if isinstance(value, (list, tuple)):
            return list(value)
        return []  # missing value (e.g. NaN / None)

    keyword_lists = [_as_keyword_list(value) for value in df['keywords']]

    if len(df) == 0:
        # Nothing to train on: return an empty frame with the same columns.
        return pd.DataFrame({'title': df['title'],
                             'keywords': keyword_lists, 'vectors': []})

    # remove punctuations and lowercase all characters; the deletion table is
    # built once rather than once per title
    punct_table = str.maketrans('', '', string.punctuation)
    cleaned_titles = [title.translate(punct_table).lower() for title in df['title']]
    # tokenize once; the same tokens are used for training and for inference
    tokenized_titles = [word_tokenize(title) for title in cleaned_titles]

    # distributed memory
    model_dm = Doc2Vec(dm=1, vector_size=vector_size, min_count=min_count, epochs=epochs)
    tagged_data = [TaggedDocument(words=tokens, tags=[str(i)])
                   for i, tokens in enumerate(tokenized_titles)]
    model_dm.build_vocab(tagged_data)
    model_dm.train(tagged_data,
                   total_examples=model_dm.corpus_count,
                   epochs=model_dm.epochs)

    document_vectors = [model_dm.infer_vector(tokens) for tokens in tokenized_titles]

    text_vec_df = pd.DataFrame({'title': df['title'],
                                'keywords': keyword_lists, 'vectors': document_vectors})
    return text_vec_df

In [43]:
# build and display the title / keywords / vectors frame for the processed posts
document_vector(df=processed_df)

Unnamed: 0,title,keywords,vectors
0,Another step closer to ruining the internet fo...,[another step closer],"[-0.03415885, -0.14972238, 0.008515159, -0.168..."
1,Chinese researchers develop calcium-based batt...,"[chinese researchers develop calcium, lasts 70...","[-0.093232565, -0.3394914, -0.012083201, -0.40..."
2,Replacement cartilage can grow in any shape wi...,[replacement cartilage],"[-0.016030788, -0.10607375, 0.0027287838, -0.1..."
3,OpenAI CEO warns that 'societal misalignments'...,"[could make artificial intelligence dangerous,...","[-0.053585082, -0.22113702, -0.017498622, -0.2..."
4,"28-ton, 1.2-megawatt tidal kite is now exporti...","[megawatt tidal kite, exporting power]","[-0.003168633, -0.068083376, -0.01440842, -0.0..."
...,...,...,...
73,MIT’s New Desalination System Produces Freshwa...,"[new desalination system produces freshwater, ...","[-0.027722199, -0.11565785, 0.015336563, -0.14..."
74,Biotagging method for animal identification us...,[animal identification using dissolvable micro...,"[-0.031671435, -0.1396776, 0.009595447, -0.129..."
75,The innovation that gets an Alzheimer’s drug t...,"[one strategy researchers, get drugs, focused ...","[-0.12904331, -0.5103372, -0.012194232, -0.597..."
76,Autophage rocket engine consumes plastic fusel...,[autophage rocket engine consumes plastic fuse...,"[-0.027845373, -0.1666336, -0.00666649, -0.179..."
