In [1]:
import datetime
import json
import pathlib
import re
import urllib.parse
from collections import Counter

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Breadcrumb names that mark a thread as relevant: crumbsRelevance() accepts a
# thread as soon as one of these appears among its breadcrumbs.
whitelist = ["Tietotekniikka", "Internet", "Tv ja radio", "Kodintekniikka"]


# Directory where storeDataFrames() writes its CSV files. Keep the trailing
# slash: file names are appended to this string as-is.
storageLocation = "./crawlerResults/"

# Create the directory (and any missing parents) up front; an already existing directory is fine.
pathlib.Path(storageLocation).mkdir(parents=True, exist_ok=True)




In [5]:
def getThreads(keyword, pageNumber):
    """Return the thread URLs linked from one page of search results.

    Args:
        keyword: search term. It is URL-encoded before being placed in the
            query string, so spaces and characters such as '&' are safe.
        pageNumber: value of the ``page`` query parameter (the caller in this
            notebook passes 1 for the first page).

    Returns:
        list of href strings, in document order, for every link whose href
        contains "https://keskustelu.suomi24.fi/t/".

    Raises:
        requests.exceptions.RequestException: if the request fails or does not
            answer within the timeout.
    """
    currentUrl = "https://keskustelu.suomi24.fi/haku?keyword={kw}&page={pn}".format(
        kw=urllib.parse.quote_plus(str(keyword)), pn=pageNumber)

    print(f"current URL: {currentUrl}")

    # without a timeout a single stalled connection would hang the whole crawl
    response = requests.get(currentUrl, timeout=30)

    soup = BeautifulSoup(response.content, 'html.parser')

    # escape the prefix so that its dots only match literal dots
    threadLinkPattern = re.compile(re.escape("https://keskustelu.suomi24.fi/t/"))
    interestingThreads = [link.get('href') for link in soup.find_all(href=threadLinkPattern)]

    print(f"page {pageNumber} has {len(interestingThreads)} threads regarding {keyword}: {interestingThreads}")
    return interestingThreads


In [6]:
def getAllThreads(keyword, pageFraction):
    """Gather thread URLs from several search result pages of ``keyword``.

    Args:
        keyword: search term, handed on to getThreads.
        pageFraction: when < 1, the fraction of all available result pages to
            crawl (something around 1E-2 keeps the run time reasonable); any
            positive fraction crawls at least one page. When >= 1, the number
            of pages to crawl (truncated to an integer).

    Returns:
        list with the thread URLs getThreads found on pages 1..pagesToCrawl.
    """
    urlList = []

    if pageFraction < 1:
        # the first result page tells how many pages the search has in total
        baseUrl = "https://keskustelu.suomi24.fi/haku?keyword=" + urllib.parse.quote_plus(str(keyword))

        response = requests.get(baseUrl, timeout=30)

        soup = BeautifulSoup(response.content, 'html.parser')

        pageCountTag = soup.find(attrs={"class": "pagination-page-count"})
        if pageCountTag is None or pageCountTag.string is None:
            # no page counter in the markup; presumably the results fit on one page
            pages = 1
        else:
            pages = int(pageCountTag.string)

        pagesToCrawl = int(pages * pageFraction)
        if pageFraction > 0:
            # int() truncates small fractions to 0, which would silently crawl nothing
            pagesToCrawl = max(1, pagesToCrawl)

    else:
        # range() below needs an int, so accept values such as 2.0 as well
        pagesToCrawl = int(pageFraction)

    print(f"\nbegin gathering thread url's from {pagesToCrawl} pages of keyword {keyword}.")
    for pageNumber in range(pagesToCrawl):
        urlList.extend(getThreads(keyword, pageNumber + 1))  # result pages are numbered from 1

    print(f"Succesfully gathered {len(urlList)} url's from the {pagesToCrawl} pages of keyword {keyword}.")

    return urlList



In [7]:
def getContent(threadUrl):
    """Download a thread page and return its embedded JSON-LD data, parsed.

    Args:
        threadUrl: URL of the thread page.

    Returns:
        The object produced by json.loads from the first
        ``type="application/ld+json"`` tag of the page.

    Raises:
        Exception: "dead page" when the HTTP status is 400 or above, or a
            descriptive message when the page has no (non-empty) ld+json tag.
        requests.exceptions.RequestException: if the request fails or does not
            answer within the timeout.
        json.JSONDecodeError: if the tag content is not valid JSON.
    """
    # without a timeout a single stalled connection would hang the whole crawl
    response = requests.get(threadUrl, timeout=30)

    if response.status_code >= 400:
        raise Exception("dead page")

    soup = BeautifulSoup(response.content, 'html.parser')

    # first ld+json tag of the page; None when there is no such tag
    discussionTag = soup.find(type="application/ld+json")
    if discussionTag is None or not discussionTag.contents:
        # say what is wrong instead of failing with "list index out of range"
        raise Exception("a page without ld+json discussion data")

    return json.loads(discussionTag.contents[0])




In [8]:
def getBreadcrumbs(discussionJson):
    """Return the breadcrumb names of a discussion, in the order they are listed.

    The crumbs are read from the third element of ``discussionJson``: each
    entry of its 'itemListElement' wraps an 'item' whose 'name' is collected.
    """
    crumbEntries = discussionJson[2]['itemListElement']
    return [entry['item']['name'] for entry in crumbEntries]


In [9]:
def crumbsRelevance(breadcrumbsList):
    """Return True when at least one entry of ``whitelist`` is among the given breadcrumbs."""
    return any(topic in breadcrumbsList for topic in whitelist)


In [10]:
def countComments(threadContent):
    """Return how many comments a thread has: top-level comments plus their direct replies.

    ``threadContent["comment"]`` holds the top-level comments; a comment that
    has replies carries them under its own "comment" key.
    """
    topLevelComments = threadContent["comment"]
    replyLists = (entry.get("comment") for entry in topLevelComments)
    replyTotal = sum(len(replies) for replies in replyLists if replies is not None)
    return len(topLevelComments) + replyTotal


In [11]:
def crawlUrlList(urlList, keyword="unspecified", noFilter=False, maxAgeDays=3*365):
    """Crawl the given thread URLs and return the kept threads as a DataFrame.

    A thread is kept when its page can be fetched, its breadcrumbs are
    relevant (see crumbsRelevance) or ``noFilter`` is set, and it was created
    less than ``maxAgeDays`` days ago.

    Args:
        urlList: thread URLs to crawl.
        keyword: search keyword the URLs came from; stored with every row.
        noFilter: if True, keep threads regardless of their breadcrumbs.
        maxAgeDays: threads created at least this many days ago are dropped.

    Returns:
        DataFrame indexed by 'url' with the columns author, category, time,
        headline, text, commentCount, comments and keyword (one row per kept
        thread, in crawl order).
    """
    columns = ["author", "category", "url", "time", "headline", "text", "commentCount", "comments", "keyword"]
    records = []  # one dict per kept thread, so the columns can never get out of step

    totalLength = len(urlList)  # used in progress tracking
    expiringTime = datetime.timedelta(days=maxAgeDays)
    # naive UTC "now", comparable with the naive timestamps parsed below
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

    for doing, threadUrl in enumerate(urlList, start=1):
        print(f"progress: {doing}/{totalLength}")  # track progress

        try:
            allContent = getContent(threadUrl)
        except Exception as ex:
            print(f"met exception: {threadUrl} is {ex}")
            continue

        try:
            threadCategory = getBreadcrumbs(allContent)

            # continue only if the category is relevant.
            if not (crumbsRelevance(threadCategory) or noFilter):
                print(f"{threadUrl} is out-of-context.\n")
                continue

            threadContent = allContent[3]  # aside from category, all the data we want is in the 4th item

            # the timestamp string ends in "Z" (UTC marker); drop it so that
            # fromisoformat() yields a naive datetime
            createdText = threadContent["dateCreated"]
            if createdText.endswith("Z"):
                createdText = createdText[:-1]
            timePast = now - datetime.datetime.fromisoformat(createdText)

            # continue only if the post isn't too old.
            if not timePast < expiringTime:
                print(f"that is {timePast.days} days ago, way too old.")
                continue

            record = {
                "author": threadContent["author"]["name"],
                "category": threadCategory,
                "url": threadUrl,
                "time": threadContent["dateCreated"],
                "headline": threadContent["headline"],
                "text": threadContent["text"],
                "commentCount": countComments(threadContent),
                "comments": threadContent["comment"],
                "keyword": keyword,
            }
        except (KeyError, IndexError, TypeError, ValueError) as ex:
            # a page with an unexpected data layout should not abort the whole crawl
            print(f"skipping {threadUrl}: unexpected page data ({ex!r})")
            continue

        records.append(record)

    posts_df = pd.DataFrame(records, columns=columns).set_index('url')

    print(f"succesfully loaded {len(records)} out of {totalLength} threads.")
    return posts_df


In [12]:
def extractComments(threadsDf):
    """Flatten the comments of the given threads into a DataFrame of comment rows.

    Only threads with a non-zero 'commentCount' are used. For each thread:
      * every comment that has replies yields one row for the comment itself
        (replyCount = number of replies) and one row holding the texts of all
        its replies joined by spaces, under the url/time of the last reply;
      * all comments without replies are merged into a single row (texts
        joined by spaces) under the url/time of the last such comment. The row
        is left out when the thread has no such comment.

    Args:
        threadsDf: DataFrame indexed by thread URL with the columns
            'commentCount' and 'comments' (list of comment dicts per thread).

    Returns:
        DataFrame indexed by 'url' with the columns time, text, replyCount and
        parent (URL of the thread the row belongs to).
    """
    comments = {"url": [], "time": [], "text": [], "replyCount": [], "parent": []}

    def addRow(url, time, text, replyCount, parent):
        # append one row to the column lists, keeping them the same length
        comments["url"].append(url)
        comments["time"].append(time)
        comments["text"].append(text)
        comments["replyCount"].append(replyCount)
        comments["parent"].append(parent)

    threadsDf = threadsDf[threadsDf["commentCount"] != 0]

    for threadUrl, threadComments in dict(threadsDf["comments"]).items():

        noReplyComments = []  # comments of this thread that have no replies

        for comment in threadComments:
            replies = comment.get("comment")

            if not replies:
                # a missing or empty reply list both mean "no replies"
                noReplyComments.append(comment)
                continue

            # the comment itself
            addRow(comment["url"], comment["dateCreated"], comment["text"], len(replies), threadUrl)

            # its replies, merged into one row under the last listed reply
            lastReply = replies[-1]
            rtext = "".join(reply["text"] + " " for reply in replies)
            addRow(lastReply["url"], lastReply["dateCreated"], rtext, 0, threadUrl)

        # emit the merged row only if this thread has reply-less comments;
        # otherwise there is no url/time of this thread to file it under
        if noReplyComments:
            lastComment = noReplyComments[-1]
            nrtext = "".join(comment["text"] + " " for comment in noReplyComments)
            addRow(lastComment["url"], lastComment["dateCreated"], nrtext, 0, threadUrl)

    comments_df = pd.DataFrame(comments).set_index('url')

    print(f"succesfully extracted {len(comments_df.index)} rows of data from {len(threadsDf.index)} threads.")

    return comments_df
            

            




In [13]:
def categoryCounter(posts_df):
    """Count the posts per main category and return the counts, largest first.

    The main category of a post is the third entry of its 'category' list, or
    the second entry when the list has exactly two entries; shorter lists are
    not counted.

    Returns:
        Series indexed by main category name holding the post counts, sorted
        in descending order.
    """
    crumbLists = posts_df["category"].tolist()

    # posts with three or more crumbs first, then the two-crumb posts
    mainNames = [crumbs[2] for crumbs in crumbLists if len(crumbs) > 2]
    mainNames += [crumbs[1] for crumbs in crumbLists if len(crumbs) == 2]

    tally = Counter(mainNames)
    counts = pd.Series(list(tally.values()), index=list(tally.keys()))

    return counts.sort_values(ascending=False)


In [14]:
def findSharedCategories(dfDict, topNToKeep=10):
    """Return the main categories that are among the top ones of every DataFrame.

    For each DataFrame the ``topNToKeep`` most frequent main categories are
    taken from categoryCounter. A category is kept only if it appears in every
    one of these top lists; its value is the sum of its counts.

    Args:
        dfDict: dict of {"name": DataFrame of posts}.
        topNToKeep: how many of the most frequent categories of each DataFrame
            take part in the comparison.

    Returns:
        Series indexed by category name with the summed post counts, sorted in
        descending order; an empty Series when ``dfDict`` is empty.
    """
    topSeriesList = [categoryCounter(df).head(topNToKeep) for df in dfDict.values()]

    if not topSeriesList:
        return pd.Series(dtype="float64")

    sharedCategorySeries = topSeriesList[0]

    for seriesTopCategories in topSeriesList[1:]:  # first one already accounted for
        # index-aligned addition: a category missing from either side becomes NaN
        sharedCategorySeries = sharedCategorySeries + seriesTopCategories

    # dropna() removes the categories that were missing from at least one list
    return sharedCategorySeries.dropna().sort_values(ascending=False)

In [15]:
def storeDataFrames(dfDict):
    """Write every DataFrame of ``dfDict`` to its own CSV file.

    The files are named "<storageLocation><name>_data_<YYYY-MM>.csv", using
    the current year and month.

    Args:
        dfDict: dict of {"name": DataFrame}.

    Returns:
        dict of {"name": path of the CSV file written for that DataFrame}.
    """
    csvDict = {}

    for dfName, frame in dfDict.items():
        csvName = f"{storageLocation}{dfName}_data_{datetime.date.today():%Y-%m}.csv"
        frame.to_csv(csvName)
        csvDict[dfName] = csvName

    return csvDict

def readDataFrames(csvDict, indexCol=None):
    """Load the CSV files listed in ``csvDict`` back into DataFrames.

    Counterpart of storeDataFrames. Since storeDataFrames writes the index as
    the first CSV column, pass ``indexCol=0`` to get that index back; with the
    default the first column is loaded as an ordinary column.

    NOTE: cells that held lists or dicts when the CSV was written are not
    parsed back into Python objects by this function.

    Args:
        csvDict: dict of {"name": path of (or file object for) a CSV file}.
        indexCol: passed to pandas.read_csv as ``index_col``.

    Returns:
        dict of {"name": DataFrame loaded from the corresponding CSV}.
    """
    return {dfName: pd.read_csv(csvName, index_col=indexCol) for dfName, csvName in csvDict.items()}
    

In [16]:
# crawl settings: a pageFraction >= 1 is the number of result pages per keyword
pageFraction = 1
# only the first two keywords of the list are crawled
keywordList = ["dna", "elisa", "telia", "sonera"][:2]
allDataFrames = {}

for keyword in keywordList:
    currentThreads = getAllThreads(keyword, pageFraction)
    # threads with the most comments (the most popular ones) come first
    currentDf = crawlUrlList(currentThreads, keyword, noFilter=False).sort_values(
        by="commentCount", ascending=False)
    allDataFrames[keyword] = currentDf
    print(f"Created new DataFrame {keyword} with {len(currentDf.index)} threads.\n")

# one CSV per keyword; keeps the {keyword: file name} mapping for later loading
allcsvDicts = storeDataFrames(allDataFrames)


begin gathering thread url's from 1 pages of keyword dna.
current URL: https://keskustelu.suomi24.fi/haku?keyword=dna&page=1
page 1 has 20 threads regarding dna: ['https://keskustelu.suomi24.fi/t/17878484/ty-prepaid-liittyma-emdnaem', 'https://keskustelu.suomi24.fi/t/17867328/emdnaem-4g-hehh', 'https://keskustelu.suomi24.fi/t/17856629/emdnaem-on-kusettaa-reilust', 'https://keskustelu.suomi24.fi/t/17852538/emdnaemn-liittyma-hairitse', 'https://keskustelu.suomi24.fi/t/17850494/kuinka-postetaan-emdnaem-oyj-nettisivuille-t', 'https://keskustelu.suomi24.fi/t/17816402/vat-teleoperaattori-emdnaem-oyjn-asiakaskokemu', 'https://keskustelu.suomi24.fi/t/17799626/emdnaem-testi---miten-tieta', 'https://keskustelu.suomi24.fi/t/17753098/minuakin-emdnaem-huijasi-', 'https://keskustelu.suomi24.fi/t/17748353/emdnaem-virus-pesake', 'https://keskustelu.suomi24.fi/t/17747680/emdnaem-oikea-virus-pesa-pu', 'https://keskustelu.suomi24.fi/t/17880563/onko-nain-emdnaem-prepaid', 'https://keskustelu.suomi24.fi/t

In [17]:
# Extract the comment rows of every crawled keyword, then combine them with a
# single concat instead of growing a DataFrame inside the loop.
commentFrames = []
for df in allDataFrames.values():
    comments = extractComments(df)
    commentFrames.append(comments)

# comments with the most replies come first
allComments = pd.concat(commentFrames).sort_values(by="replyCount", ascending=False)


succesfully extracted 33 rows of data from 9 threads.
succesfully extracted 47 rows of data from 7 threads.


In [18]:
# Plain-text dump of the combined comment rows (indexed by comment URL).
print(allComments)

                                                                        time  \
url                                                                            
https://keskustelu.suomi24.fi/t/17812132/miten-...  2023-04-05T12:52:41.000Z   
https://keskustelu.suomi24.fi/t/17746121/elisa-...  2023-02-15T13:14:13.000Z   
https://keskustelu.suomi24.fi/t/17746121/elisa-...  2023-03-08T07:37:30.000Z   
https://keskustelu.suomi24.fi/t/17746121/elisa-...  2023-02-17T19:08:39.000Z   
https://keskustelu.suomi24.fi/t/17746121/elisa-...  2023-02-16T11:53:14.000Z   
...                                                                      ...   
https://keskustelu.suomi24.fi/t/17746121/elisa-...  2023-02-16T08:23:42.000Z   
https://keskustelu.suomi24.fi/t/17816402/mitka-...  2023-05-05T20:42:42.000Z   
https://keskustelu.suomi24.fi/t/17746121/elisa-...  2023-03-08T18:52:55.000Z   
https://keskustelu.suomi24.fi/t/17746121/elisa-...  2023-02-17T07:50:16.000Z   
https://keskustelu.suomi24.fi/t/17772514