This notebook will look at the domains that are present in the reddit dataset and classify those as either 
- 0 scam or irrelevant domain (e.g. bit.ly, goo.gl, etc)
- 1 generic social media domain (youtube, twitter, facebook, pinterest, instagram, etc)
- 2 relevant news site (nytimes, wsj, cnn, phys, etc)
- 3 science repository (direct link to paper like doi, arxiv, pubmed, etc)

It will therefore use gpt-4o-mini to classify the domains.

In [1]:
import os
from enum import Enum

import pandas as pd
from IPython.core.debugger import prompt
from tqdm import tqdm
import numpy as np
from openai import OpenAI
import json
import seaborn as sns

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


In [2]:
df = pd.read_csv("./data/evaluate r_science post quality - domains.csv")
df.head()


Unnamed: 0,domain,label,count,relative_proportion,cumsum
0,youtube.com,0.0,12970,0.04681,0.04681
1,nature.com,1.0,10332,0.03729,0.0841
2,eurekalert.org,2.0,6527,0.023557,0.107657
3,youtu.be,0.0,5211,0.018807,0.126464
4,self.science,0.0,4001,0.01444,0.140904


In [3]:
# enum of the labels
class DomainLabel(Enum):
    UNLABELED = "unlabeled"
    FAILED = "failed"
    LESS_THAN_2 = "less_than_2"
    UNKNOWN = "unknown"
    SCAM = "scam"
    SOCIAL_MEDIA = "social_media"
    NEWS = "news"
    SCIENCE = "scientific"
    REPO = "repo"
    INDECISIVE = "indecisive"
labels = {
    DomainLabel.UNLABELED.value: -99,
    DomainLabel.FAILED.value: -98,
    DomainLabel.LESS_THAN_2.value: -2,
    DomainLabel.UNKNOWN.value: -1,
    DomainLabel.SCAM.value: 0,
    DomainLabel.SOCIAL_MEDIA.value: 1,
    DomainLabel.NEWS.value: 2,
    DomainLabel.SCIENCE.value: 3,
    DomainLabel.REPO.value: 4,
    DomainLabel.INDECISIVE.value: -3
}


In [4]:
# stripping the domain to the last two parts 
# Ended up not using it, as this would remove the subdomain, which might be important for the classification
# Furthermore, this could introduce further complications in downstream tasks
def strip(x:str):
    split = x.split(".")
    if len(split) <= 2:
        return x
    if split[0] in ["www", "m", "i", "l", "mobile", "link"] or split[0] == "" or len(split[0]) == 1:
        split = split[1:]
    return ".".join(split)

df['domain_stripped'] = df['domain'].apply(strip)

In [5]:
df[df["domain_stripped"] != df["domain"]]["domain"]

30       link.springer.com
136          m.youtube.com
184             m.phys.org
267            i.imgur.com
392             m.pnas.org
               ...        
30276       i.makeagif.com
30277        i.insider.com
30278           i.imgtc.ws
30279        i.ebayimg.com
30280         i.chzbgr.com
Name: domain, Length: 212, dtype: object

In [7]:
prompt = f"""
Please classify the following domains as either:
"scam":  scam or irrelevant domain (e.g. bit.ly, goo.gl, etc)
"social_media": generic social media domain (youtube, twitter, facebook, pinterest, instagram, etc)
"news": relevant news site (nytimes, wsj, cnn, phys, etc)
"scientific": relevant science site (sciencedaily, phys, nature, etc) and university sites ending in .edu that are not repositories fall under this category
"repo": science repository (direct link to paper like doi, arxiv, pubmed, etc)
"unknown": if unsure, please classify as "unknown".
The output should be a dictionary with the domain as key and the rating as value, like the following:
{{
    "bit.ly": "scam",
    "youtube.com": "social_media",
    "nytimes.com": "news",
    "sciencedaily.com": "scientific",
    "doi.org": "repo",
     }}
Please stick exactly to the labels provided above. If you are unsure, please classify as "unknown".
"""

In [8]:
def classify_domains(domains: list, client: OpenAI, i: int) -> list:
    for tries in range(3): 
        response = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": prompt
                },
                {"role": "user",
                 "content": f"Domains: {str(domains)}",
                 },
            ],
            model="gpt-4o-mini",
            temperature=0.0,
        )
        output = response.choices[0].message.content
        dictio = json.loads(output)
        failed = False
        for domain in domains:
            if domain not in dictio or dictio[domain] not in [label.value for label in DomainLabel]:
                print("Failed to get a valid response from the model for i = ", i, " and domain: ", domain)
                failed = True
                continue
        if not failed:
            return [dictio[domain] for domain in domains]
    print("Failed to get a valid response from the model for i = ", i, " and domains: ", domains, " with response: ", dictio)
    return [DomainLabel.FAILED.value]*len(domains)

In [9]:
# Init the column which will be used to classify the domains with Unlabeled
column_name = "domain_rating_2"
if column_name not in df.columns:
    df[column_name] = DomainLabel.UNLABELED.value
else: 
    print("Column already present")
    
# If the domain has only one occurrence, we can classify it directly as less than 2
df.loc[df["count"] <= 1, column_name] = DomainLabel.LESS_THAN_2.value 
left_over = len(df["domain"]) - len(df[df[column_name] == DomainLabel.LESS_THAN_2.value])
print("Left over to be classified: ", left_over)

Column already present
Left over to be classified:  11163


In [13]:
# Loop to classify left over domains
context = 2
for i in tqdm(range(0, left_over, context)):
    if bool(df[column_name].iloc[i] == DomainLabel.UNLABELED.value) or bool(df[column_name].iloc[i] == DomainLabel.FAILED.value):
        domains = df["domain"].iloc[i:i+context].tolist()
        ratings = classify_domains(domains, client, i)
        df.loc[i:i+context -1, column_name] = ratings
    #else:
        #print("Skipping ", i, " as it is already classified")
df.loc[df["count"] <= 1, column_name] = DomainLabel.LESS_THAN_2.value # reclassify the less than 2 might be overwritten
df[column_name].value_counts()

100%|██████████| 5582/5582 [00:05<00:00, 1105.01it/s] 


domain_rating_2
less_than_2     19196
unknown          4490
scam             2898
scientific       1673
news             1411
repo              535
social_media      156
Name: count, dtype: int64

In [14]:
# In case of failures, first retry with smaller context size (need to be devider of 50)
# If a domain continues to be classified as failed, we can classify it manually
#df.loc[df["domain"] == "tamilastronomy.in", column_name] = DomainLabel.SCAM.value
#df.loc[df["domain"] == "newswire.ca", column_name] = DomainLabel.NEWS.value
df[column_name].value_counts()
df.head(25)

Unnamed: 0.1,Unnamed: 0,domain,label,count,relative_proportion,cumsum,domain_rating_0,domain_rating_1,domain_rating_2
0,0,youtube.com,0.0,12970,0.04681,0.04681,social_media,social_media,social_media
1,1,nature.com,1.0,10332,0.03729,0.0841,scientific,scientific,scientific
2,2,eurekalert.org,2.0,6527,0.023557,0.107657,news,news,news
3,3,youtu.be,0.0,5211,0.018807,0.126464,social_media,social_media,social_media
4,4,self.science,0.0,4001,0.01444,0.140904,unknown,unknown,unknown
5,5,sciencedirect.com,1.0,3737,0.013487,0.154391,scientific,scientific,scientific
6,6,sciencedaily.com,2.0,3317,0.011971,0.166363,scientific,scientific,scientific
7,7,psypost.org,2.0,3253,0.011741,0.178103,news,news,news
8,8,phys.org,2.0,3169,0.011437,0.189541,scientific,scientific,scientific
9,9,theguardian.com,2.0,3085,0.011134,0.200675,news,news,news


In [17]:
# voting mechanism to get the final classification
# If all are different classify as indecisive

df["label_voting_lm"] = df[["domain_rating_0", "domain_rating_1", "domain_rating_2"]].mode(axis=1)[0]
df.loc[(df["domain_rating_0"] != df["domain_rating_1"]) & (df["domain_rating_0"] != df["domain_rating_2"]) & (df["domain_rating_1"] != df["domain_rating_2"]), "label_voting_lm"] = DomainLabel.INDECISIVE.value

InvalidIndexError: (slice(100, 125, None), ['domain', 'label', 'count', 'domain_rating_0', 'domain_rating_1', 'domain_rating_2', 'label_voting_lm'])

In [21]:
#df.loc[100:125, ["domain", "label", "count", "domain_rating_0", "domain_rating_1", "domain_rating_2", "label_voting_lm"]]
df["domain_rating_0"].value_counts(), df["domain_rating_1"].value_counts(), df["domain_rating_2"].value_counts()

(domain_rating_0
 less_than_2     19196
 unknown          4431
 scam             2972
 scientific       1666
 news             1408
 repo              531
 social_media      155
 Name: count, dtype: int64,
 domain_rating_1
 less_than_2     19196
 unknown          4486
 scam             2918
 scientific       1646
 news             1414
 repo              544
 social_media      155
 Name: count, dtype: int64,
 domain_rating_2
 less_than_2     19196
 unknown          4490
 scam             2898
 scientific       1673
 news             1411
 repo              535
 social_media      156
 Name: count, dtype: int64)

In [22]:
# Don't forget to save the file!
df.to_csv("./data/evaluate r_science post quality - domains_lm labeled.csv", index=True)

# Evaluation

In [14]:
df[df["label"] == 0][["domain", "label", "domain_rating_0"]]

Unnamed: 0,domain,label,domain_rating_0
0,youtube.com,0.0,social_media
3,youtu.be,0.0,social_media
4,self.science,0.0,unknown
11,pinterest.com,0.0,social_media
31,mightyviral.com,0.0,scam
35,reddit.com,0.0,social_media
36,moviesdost.com,0.0,scam
41,hit2k.com,0.0,scam
49,google.com,0.0,scam
54,hiwebex.com,0.0,scam


In [15]:
df[df["label"] == 1][["domain", "label", "domain_rating_0"]]


Unnamed: 0,domain,label,domain_rating_0
1,nature.com,1.0,scientific
5,sciencedirect.com,1.0,scientific
14,pnas.org,1.0,scientific
16,onlinelibrary.wiley.com,1.0,scientific
20,ncbi.nlm.nih.gov,1.0,repo
21,journals.plos.org,1.0,scientific
22,cell.com,1.0,scientific
26,academic.oup.com,1.0,scientific
28,jamanetwork.com,1.0,scientific
29,doi.org,1.0,repo


In [20]:
print(df[df["label"] == 2]["domain_rating_0"].value_counts()) 
df[df["label"] == 3][["domain", "label", "domain_rating_0"]]

domain_rating_0
news          83
scientific    23
unknown        9
scam           3
Name: count, dtype: int64


Unnamed: 0,domain,label,domain_rating_0
23,science.sciencemag.org,3.0,repo
51,science.org,3.0,scientific
61,nejm.org,3.0,repo
116,cdc.gov,3.0,scientific
168,aeaweb.org,3.0,scientific


In [36]:
df[["domain", "label", "domain_rating_0", "domain_rating_1", "domain_rating_2"]].

Unnamed: 0,domain,label,domain_rating_0,domain_rating_1,domain_rating_2
0,youtube.com,0.0,social_media,social_media,social_media
1,nature.com,1.0,scientific,scientific,scientific
2,eurekalert.org,2.0,scientific,news,news
3,youtu.be,0.0,social_media,social_media,social_media
4,self.science,0.0,unknown,unknown,unknown
...,...,...,...,...,...
30354,hydromo.in,,less_than_2,less_than_2,less_than_2
30355,hydrogenwatermaker.net,,less_than_2,less_than_2,less_than_2
30356,hydrogengirls.com,,less_than_2,less_than_2,less_than_2
30357,hydrogen-central.com,,less_than_2,less_than_2,less_than_2


In [38]:
# Evaluate how consistent the ratings of chat are
df[~(df["domain_rating_0"] == df["domain_rating_1"] == df["domain_rating_2"])][df["domain_rating_2"] != DomainLabel.UNLABELED.value][["domain", "label", "count", "domain_rating_0", "domain_rating_1", "domain_rating_2"]]

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [42]:
# get all rows that are not identical in row domain_rating_0 and domain_rating_1 and domain_rating_2
df[(df['domain_rating_0'] != df['domain_rating_1']) |
                 (df['domain_rating_0'] != df['domain_rating_2']) |
                 (df['domain_rating_1'] != df['domain_rating_2'])][df["domain_rating_2"] != DomainLabel.UNLABELED.value][["domain", "label", "count", "domain_rating_0", "domain_rating_1", "domain_rating_2"]]

  df[(df['domain_rating_0'] != df['domain_rating_1']) |


Unnamed: 0,domain,label,count,domain_rating_0,domain_rating_1,domain_rating_2
2,eurekalert.org,2.0,6527,scientific,news,news
21,journals.plos.org,1.0,1559,scientific,scientific,repo
23,science.sciencemag.org,3.0,1507,repo,scientific,repo
28,jamanetwork.com,1.0,1255,scientific,scientific,news
37,researchgate.net,1.0,969,repo,unknown,unknown
38,advances.sciencemag.org,1.0,965,repo,scientific,repo
93,scitechdaily.com,2.0,394,scientific,news,scientific
144,relationshipengineeringblog.wordpress.com,0.0,241,unknown,scam,scam
153,yacinews.com,2.0,221,scam,unknown,unknown
206,collective-spark.xyz,,158,unknown,scam,scam
