# Crawler

In [None]:
import requests
from bs4 import BeautifulSoup
from typing import Any, cast, List
from bs4.element import Tag
import csv
from datetime import datetime
def print_property(obj: Any):
    print('type:', type(obj))
    print('dir:', dir(obj))

# get domain from url
def get_domain(url: str) -> str:
    return url.split('/')[2]

def get_main_domain(url: str) -> str:
    domain = get_domain(url)
    return '.'.join(domain.split('.')[-2:])

def handle_detik(url: str) -> [str, str]:
    # strip url query
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # title is in h1.detail__title
    title = soup.find('h1', class_='detail__title').text.strip()
    htmlClass = 'detail__body-text itp_bodycontent'
    # only get first level of p
    paragraphs = list(filter(lambda p: len(p.text) > 0,
                       cast(Tag, soup.find('div', class_=htmlClass)).find_all(['p', 'ul', 'ol'], recursive=False)
                       ))
    flattenedText = ""
    for p in paragraphs:
        # ignore p if
        # 1. p has any attribute
        # 2. p starts with \r \n or \t
        text = p.text.strip()
        if (len(p.attrs) > 0
            or text.startswith('\r')
            or text.startswith('\n')
            or text.startswith('\t')
            or text.startswith('Simak selengkapnya')
            or p.select_one('div.detail__multiple') is not None
            or p.select_one('div.sisip_video_ds') is not None
            or text.startswith('Simak Video')
            or text.startswith('Simak juga Video')
            or text.startswith('[Gambas')
            or text.startswith('Lihat juga Video')
            or text.startswith('Lihat Video')
            or text.startswith('Baca selengkapnya')
            or 'halaman selanjutnya' in text.lower()
            or 'halaman berikutnya' in text.lower()
        ):
            continue
        else:
            flattenedText += text + '\n\n'
    # button property dtr-act="button selanjutnya"
    next_button = soup.find('a', attrs={'dtr-act': 'button selanjutnya'})
    while next_button is not None:
        next_url = next_button['href']
        response = requests.get(next_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = list(filter(lambda p: len(p.text) > 0,
                           cast(Tag, soup.find('div', class_=htmlClass)).find_all(['p', 'ul', 'ol'], recursive=False)
                           ))
        for p in paragraphs:
            text = p.text.strip()
            if (len(p.attrs) > 0
                or text.startswith('\r')
                or text.startswith('\n')
                or text.startswith('\t')
                or text.startswith('Simak selengkapnya')
                or p.select_one('div.detail__multiple') is not None
                or p.select_one('div.sisip_video_ds') is not None
                or text.startswith('Simak Video')
                or text.startswith('Simak juga Video')
                or text.startswith('[Gambas')
                or text.startswith('Lihat juga Video')
                or text.startswith('Lihat Video')
                or text.startswith('Baca selengkapnya')
                or 'halaman selanjutnya' in text.lower()
                or 'halaman berikutnya' in text.lower()
            ):
                continue
            else:
                flattenedText += text + '\n\n'
        next_button = soup.find('a', attrs={'dtr-act': 'button selanjutnya'})
    return [title, flattenedText]

def handle_kompas(url: str) -> [str, str]:
    url = url.split('?')[0]
    response = requests.get(url + '?page=all')
    soup = BeautifulSoup(response.text, 'html.parser')
    title = soup.find('h1', class_='read__title').text.strip()
    cssSelector="div.read__content > .clearfix"
    # paragraphs = list(filter(lambda p: len(p.text) > 0,
    #                    cast(Tag, soup.select_one(cssSelector)).find_all('p', recursive=False)))
    # p, ul, and ol
    paragraphs = list(filter(lambda p: len(p.text) > 0,
                       cast(Tag, soup.select_one(cssSelector)).find_all(['p', 'ul', 'ol'], recursive=False)))
    flattenedText = ""
    for p in paragraphs:
        text = p.text.strip()
        if (
            len(p.attrs) > 0 or text.startswith('\r') or text.startswith('\n') or text.startswith('\t')
            or text.startswith('Baca juga')
        ):
            continue
        else:
            flattenedText += text + '\n\n'
    if len(flattenedText.strip().strip('\n').strip('\t').strip('\r')) == 0:
        raise Exception('Empty content')
    return [title, flattenedText]

def cnn_indonesia_handler(url: str) -> [str, str]:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    titleCssSelector = 'h1.mb-2.leading-9'
    title = soup.select_one(titleCssSelector).text.strip()
    contentCssSelector = 'div.detail-text'
    paragraphs = list(filter(lambda p: len(p.text) > 0,
                       cast(Tag, soup.select_one(contentCssSelector)).find_all(['p', 'ul', 'ol'], recursive=False)))
    flattenedText = ""
    for p in paragraphs:
        text = p.text.strip()
        if (
            len(p.attrs) > 0 or text.startswith('\r') or text.startswith('\n') or text.startswith('\t')
            or text.startswith('[Gambas')
            ):
            continue
        else:
            flattenedText += text + '\n\n'
    if len(flattenedText.strip().strip('\n').strip('\t').strip('\r')) == 0:
        raise Exception('Empty content')
    return [title, flattenedText]

def generic_handler(url: str) -> str:
    # get text from body p
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = list(filter(lambda p: len(p.text.strip()) > 100 and len(p.text.split()) > 7,
                          cast(Tag, soup.body).find_all('p')))
    flattenedText = ""
    for p in paragraphs:
        if p.text.startswith('\r') or p.text.startswith('\n') or p.text.startswith('\t'):
            continue
        else:
            flattenedText += p.text + '\n\n'

    if len(flattenedText.strip().strip('\n').strip('\t').strip('\r')) == 0:
        raise Exception('Empty content')
    return flattenedText

def csv_writer(filename: str, data: List[any]) -> bool:
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['title', 'content'])
        for row in data:
            writer.writerow(row)

def csv_writer_discovery(filename: str, data: List[any]):
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['title', 'author', 'published', 'url', 'content'])
        for row in data:
            writer.writerow(row)

if __name__ == '__main__':
    # Manual Input
    print("WARNING: please install BeautifulSoup4 and requests (pip) first")
    choice = input('Automatic? (y/n): ')
    if choice.lower() == 'n':
        url = input('URL: ')

        # if url not start with http:// or https://, add http://
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://' + url

        main_domain = get_main_domain(url)
        if main_domain == 'detik.com':
            print()
            print(handle_detik(url))
        elif main_domain == 'kompas.com':
            print()
            print(handle_kompas(url))
        elif main_domain == 'cnnindonesia.com':
            print()
            print(cnn_indonesia_handler(url))
        else:
            print("Using generic handler")
            print(generic_handler(url))
    elif choice.lower() == 'y':
        print("Pick Portal:")
        print("1. Detik")
        print("2. Kompas")
        print("3. CNN Indonesia")
        print("4. Search By Topic (Google)")
        choice = input("Your choice: ")
        if choice == '1':
            how_many = input("How many news do you want to pick? ")
            homepage = requests.get('https://news.detik.com/indeks')
            soup = BeautifulSoup(homepage.text, 'html.parser')
            # list-content
            articleSelector = '.list-content > article h3 > a'
            nextPageSelector = 'a.pagination__item'
            nextPageText = 'Next'
            contents = []
            nextPage = soup.find('a', string=nextPageText)
            while len(contents) < int(how_many) and nextPage is not None:
                links = list(
                    map ( lambda a: a['href'],
                    filter(lambda a: a['href'].startswith('https://news.detik.com/')
                           , soup.select(articleSelector))
                    )
                )
                for link in links:
                    if len(contents) >= int(how_many):
                        break
                    try:
                        contents.append(handle_detik(link))
                    except Exception as e:
                        print(e)
                        print(f'Error on {link}')
                if len(contents) >= int(how_many):
                    break
                nextPage = soup.find('a', string=nextPageText)
                nextPageUrl = nextPage['href']
                nextPageResponse = requests.get(nextPageUrl)
                soup = BeautifulSoup(nextPageResponse.text, 'html.parser')

            # write to csv
            currentTimestamp = datetime.now().strftime("%Y%m%d%H%M%S")
            filename = f'detik_{how_many}_{currentTimestamp}.csv'
            csv_writer(filename, contents)
            print(f'File {filename} created')
        elif choice == '2':
            how_many = input("How many news do you want to pick? ")
            homepage = requests.get('https://indeks.kompas.com/')
            soup = BeautifulSoup(homepage.text, 'html.parser')
            # list-content
            articleSelector = '.article__list__title a'
            nextPageSelector = 'a.paging__link.paging__link--next'
            nextPage = soup.select_one(nextPageSelector)
            contents = []
            while len(contents) < int(how_many) and nextPage is not None:
                links = list(
                    map ( lambda a: a['href'],
                        soup.select(articleSelector)
                    )
                )
                for link in links:
                    if len(contents) >= int(how_many):
                        break
                    try:
                        contents.append(handle_kompas(link))
                    except Exception as e:
                        print(e)
                        print(f'Error on {link}')
                if len(contents) >= int(how_many):
                    break
                nextPageUrl = nextPage['href']
                nextPageResponse = requests.get(nextPageUrl)
                soup = BeautifulSoup(nextPageResponse.text, 'html.parser')
                nextPage = soup.select_one(nextPageSelector)
            currentTimestamp = datetime.now().strftime("%Y%m%d%H%M%S")
            filename = f'kompas_{how_many}_{currentTimestamp}.csv'
            csv_writer(filename, contents)
            print(f'File {filename} created')
        elif choice == '3':
            how_many = input("How many news do you want to pick? ")
            homepage = requests.get('https://www.cnnindonesia.com/indeks')
            soup = BeautifulSoup(homepage.text, 'html.parser')
            # nhl-list
            articleSelector = 'article > a'
            nextPageSelector = 'a[dtr-sec="halaman selanjutnya"][dtr-act="halaman selanjutnya"]'
            nextPage = soup.select_one(nextPageSelector)
            contents = []
            while len(contents) < int(how_many) and nextPage is not None:
                links = list(
                    map ( lambda a: a['href'],
                    filter(lambda a: a['href'].startswith('https://www.cnnindonesia.com/'), soup.select(articleSelector))
                    )
                )
                for link in links:
                    if len(contents) >= int(how_many):
                        break
                    try:
                        contents.append(cnn_indonesia_handler(link))
                    except Exception as e:
                        print(e)
                        print(f'Error on {link}')
                nextPageUrl = nextPage['href']
                nextPageResponse = requests.get(nextPageUrl)
                soup = BeautifulSoup(nextPageResponse.text, 'html.parser')
            currentTimestamp = datetime.now().strftime("%Y%m%d%H%M%S")
            filename = f'cnn_indonesia_{how_many}_{currentTimestamp}.csv'
            csv_writer(filename, contents)
            print(f'File {filename} created')
        elif choice == '4':
            how_many = input("How many news do you want to pick? ")
            topic = input("Topic: ")
            homepage = requests.get(f'https://www.google.com/search?q={topic}&tbm=nws')
            soup = BeautifulSoup(homepage.text, 'html.parser')
            # select anchor with href that starts with /url?q=
            articleSelector = 'div#main a[href^="/url?q="]'
            # select footer anchor with href that starts with /search?q=
            nextPageSelector = 'footer a[href^="/search?q="]'
            nextPage = soup.select_one(nextPageSelector)
            contents = []
            while len(contents) < int(how_many) and nextPage is not None:
                anchors = list(
                    filter(lambda a: a['href'].startswith('/url?q=') and 'google.com' not in a['href'], soup.select(articleSelector))
                    )
                for anchor in anchors:
                    if len(contents) >= int(how_many):
                        break
                    divorspans = anchor.parent.select('div, span')
                    divorspans = list(filter(lambda d: d.find(string=True, recursive=False) is not None, divorspans))
                    try:
                        url = anchor['href'].split('&')[0][7:]
                        title = divorspans[0].text
                        author = divorspans[1].text
                        published = divorspans[3].text
                    except Exception as e:
                        with open(f'google_{topic}.html', 'w', encoding='utf-8') as f:
                            f.write(soup.prettify())
                        print(e)
                        print(f'Error on {anchor}')
                    try:
                        content = generic_handler(url)
                        contents.append([title, author, published, url, content])
                    except Exception as e:
                        print(e)
                        print(f'Error on {url}')
                nextPageUrl = 'https://www.google.com' + nextPage['href']
                nextPageResponse = requests.get(nextPageUrl)
                soup = BeautifulSoup(nextPageResponse.text, 'html.parser')
                nextPage = soup.select_one(nextPageSelector)
            currentTimestamp = datetime.now().strftime("%Y%m%d%H%M%S")
            filename = f'google_{topic}_{how_many}_{currentTimestamp}.csv'
            csv_writer_discovery(filename, contents)
            print(f'File {filename} created')
        else:
            print("Invalid input")
    else:
        print("Invalid input")

Pick Portal:
1. Detik
2. Kompas
3. CNN Indonesia
4. Search By Topic (Google)
'NoneType' object has no attribute 'text'
Error on https://jeo.kompas.com/kisah-sepiring-nasi-dan-pelestari-beras-lokal-di-jawa
File kompas_1000_20231206101000.csv created


# Subjectivity: Rotten IMDB

In [None]:
!gdown --folder "1I5mEOzZfQvCjxrXmQ0JomVsvHgeeHXl_"
!pip install deep-translator

In [None]:
import pandas as pd
from deep_translator import GoogleTranslator
import os
from google.colab import files

In [None]:
def readlines(filepath, translator=GoogleTranslator('en', 'id')):
  with open(filepath, encoding="ISO-8859-1") as f:
    lines = [line.rstrip() for line in f]

  return translator.translate_batch(lines)

## Main

In [None]:
obj = readlines("Rotten IMDB/obj.5000.txt")
sub = readlines("Rotten IMDB/sub.5000.txt")

In [None]:
data = []
for line in obj:
  data.append((line, 0))

for line in sub:
  data.append((line, 1))

In [None]:
header = ['text', 'is_subjective']
df = pd.DataFrame(data, columns=header).sample(frac=1).reset_index(drop=True)
df.to_csv('subjectivity-Rotten IMDB.csv', index=False)
files.download('subjectivity-Rotten IMDB.csv')

## Alternative

In [None]:
obj = readlines("Rotten IMDB/obj.5000.txt")
data = [(line, 0) for line in obj]
header = ['text', 'is_subjective']
df = pd.DataFrame(data, columns=header)
df.to_csv('subjectivity-Rotten IMDB-obj.csv', index=False)
files.download('subjectivity-Rotten IMDB-obj.csv')

In [None]:
sub = readlines("Rotten IMDB/sub.5000.txt")
data = [(line, 1) for line in sub]
header = ['text', 'is_subjective']
df = pd.DataFrame(data, columns=header)
df.to_csv('subjectivity-Rotten IMDB-sub.csv', index=False)
files.download('subjectivity-Rotten IMDB-sub.csv')

In [None]:
sub = pd.read_csv("subjectivity-Rotten IMDB-sub.csv")
obj = pd.read_csv("subjectivity-Rotten IMDB-obj.csv")
df = pd.concat([sub, obj]).sample(frac=1).reset_index(drop=True)

In [None]:
df.to_csv('subjectivity-Rotten IMDB.csv', index=False)
files.download('subjectivity-Rotten IMDB.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Subjectivity: MPQA

In [None]:
!gdown --folder "1ONyUksWzuZ-C2PM86kwYoPegQXHNCah4"
!pip install deep-translator

Retrieving folder list
Processing file 1sY2CfTZoYn9AgvX2c9Lzn32Uwz1hPTG_ CoNNL11SenseAnnotations.txt
Processing file 1eELZiLbe8Jdfu13WCKY-3lSDls58Vyee EMNLP09SenseAnnotations_SENSEVAL1.txt
Processing file 1AuOD-e8jbo6eIkgKYXqj21w8hdcltxaD EMNLP09SenseAnnotations_SENSEVAL2.txt
Processing file 1IflrZ5g0dcRPpb15Z9Cu_OouSspB71Q8 EMNLP09SenseAnnotations_SENSEVAL3.txt
Processing file 1ha1DXnGOPOcZ1ChWR-9uiSLnBH-yDqSz goldstandard.total.acl06.txt
Processing file 1f0Fs5-iSOqvCrWmii-RBBhe6SYY9Dp-- mixed.txt
Retrieving folder 1g68sb6X6lDG5jJrV15m7W6FHxE0JzH2u Weak/Strong
Processing file 1CXzNM5Jxe8RkkbK-6U8O6BOEBHduG0aZ strong+weak.txt
Processing file 1LJZ-Dk5tS-5wNKgEKFbnp_zeEBKaSFOx strongsubj.txt
Processing file 11RSEvKBWaU1Jtyj5_LEwUiafEsg2bDMm subjclueslen1-HLTEMNLP05.tff.txt
Processing file 1cCw0DUF4iKBb9u5qZDTI46miHrxdyGy6 weaksubj.txt
Retrieving folder list completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1s

In [None]:
import pandas as pd
from deep_translator import GoogleTranslator
import os
from google.colab import files

## Prep

### 1

In [None]:
translator = GoogleTranslator('en', 'id')

with open("/content/MPQA/CoNNL11SenseAnnotations.txt", encoding="ISO-8859-1") as f:
    list_of_words = set()
    sub = dict()
    obj = dict()

    for line in f:
      words = line.rstrip().split(" ")
      words, label = words[0], words[-1]
      words = translator.translate(words)
      list_of_words.add(words)

      if label == "subj":
        count = sub.get(words, 0)
        sub[words] = count + 1
      elif label == "obj":
        count = obj.get(words, 0)
        obj[words] = count + 1

In [None]:
data = []
for word in list_of_words:
  sub_count = sub.get(word, 0)
  obj_count = obj.get(word, 0)
  total = sub_count + obj_count
  data.append((word, sub_count/total, obj_count/total))

In [None]:
header = ['text', 'subjectivity_score', 'objectivity_score']
df = pd.DataFrame(data, columns=header).sample(frac=1).reset_index(drop=True)
df.to_csv('subjectivity-MPQA-CoNNL.csv', index=False)
files.download('subjectivity-MPQA-CoNNL.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

###2

In [None]:
translator = GoogleTranslator('en', 'id')

with open("/content/MPQA/EMNLP09SenseAnnotations_SENSEVAL1.txt", encoding="ISO-8859-1") as f:
    list_of_words = set()
    sub = dict()
    obj = dict()

    for line in f:
      words = line.rstrip().split(" ")
      words, label= words[0], words[-1]
      words = translator.translate(words)
      list_of_words.add(words)

      if label == "subj":
        count = sub.get(words, 0)
        sub[words] = count + 1
      elif label == "obj":
        count = obj.get(words, 0)
        obj[words] = count + 1

In [None]:
data = []
for word in list_of_words:
  sub_count = sub.get(word, 0)
  obj_count = obj.get(word, 0)
  total = sub_count + obj_count
  data.append((word, sub_count/total, obj_count/total))

In [None]:
header = ['text', 'subjectivity_score', 'objectivity_score']
df = pd.DataFrame(data, columns=header).sample(frac=1).reset_index(drop=True)
df.to_csv('subjectivity-MPQA-EMNLP-SENSEVAL1.csv', index=False)
files.download('subjectivity-MPQA-EMNLP-SENSEVAL1.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

###3

In [None]:
translator = GoogleTranslator('en', 'id')

with open("/content/MPQA/EMNLP09SenseAnnotations_SENSEVAL2.txt", encoding="ISO-8859-1") as f:
    list_of_words = set()
    sub = dict()
    obj = dict()

    for line in f:
      words = line.rstrip().split(" ")
      words, label, alt = words[0], words[-1], words[-2]
      alt = alt.split(":")[3]
      words = translator.translate(words)
      list_of_words.add(words)

      if label == "subj":
        count = sub.get(words, 0)
        sub[words] = count + 1
      elif label == "obj":
        count = obj.get(words, 0)
        obj[words] = count + 1
      else:
          count = obj.get(alt, 0)
          obj[alt] = count + 1
          count = sub.get(alt, 0)
          sub[alt] = count + 1

      if len(alt) != 0:
        alt = translator.translate(alt)
        list_of_words.add(alt)

        if label == "subj":
          count = sub.get(alt, 0)
          sub[alt] = count + 1
        elif label == "obj":
          count = obj.get(alt, 0)
          obj[alt] = count + 1
        else:
          count = obj.get(alt, 0)
          obj[alt] = count + 1
          count = sub.get(alt, 0)
          sub[alt] = count + 1

In [None]:
data = []
for word in list_of_words:
  sub_count = sub.get(word, 0)
  obj_count = obj.get(word, 0)
  total = sub_count + obj_count
  data.append((word, sub_count/total, obj_count/total))

In [None]:
header = ['text', 'subjectivity_score', 'objectivity_score']
df = pd.DataFrame(data, columns=header).sample(frac=1).reset_index(drop=True)
df.to_csv('subjectivity-MPQA-EMNLP-SENSEVAL2.csv', index=False)
files.download('subjectivity-MPQA-EMNLP-SENSEVAL2.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

###4

In [None]:
translator = GoogleTranslator('en', 'id')

with open("/content/MPQA/EMNLP09SenseAnnotations_SENSEVAL3.txt", encoding="ISO-8859-1") as f:
    list_of_words = set()
    sub = dict()
    obj = dict()

    for line in f:
      words = line.rstrip().split(" ")
      words, label, alt = words[0], words[-1], words[-2]
      alt = alt.split(":")
      alt = alt[3] if len(alt) > 1 else ''
      words = translator.translate(words)
      list_of_words.add(words)

      if label == "subj":
        count = sub.get(words, 0)
        sub[words] = count + 1
      elif label == "obj":
        count = obj.get(words, 0)
        obj[words] = count + 1
      else:
          count = obj.get(alt, 0)
          obj[alt] = count + 1
          count = sub.get(alt, 0)
          sub[alt] = count + 1

      if len(alt) != 0:
        alt = translator.translate(alt)
        list_of_words.add(alt)

        if label == "subj":
          count = sub.get(alt, 0)
          sub[alt] = count + 1
        elif label == "obj":
          count = obj.get(alt, 0)
          obj[alt] = count + 1
        else:
          count = obj.get(alt, 0)
          obj[alt] = count + 1
          count = sub.get(alt, 0)
          sub[alt] = count + 1

In [None]:
data = []
for word in list_of_words:
  sub_count = sub.get(word, 0)
  obj_count = obj.get(word, 0)
  total = sub_count + obj_count
  data.append((word, sub_count/total, obj_count/total))

In [None]:
header = ['text', 'subjectivity_score', 'objectivity_score']
df = pd.DataFrame(data, columns=header).sample(frac=1).reset_index(drop=True)
df.to_csv('subjectivity-MPQA-EMNLP-SENSEVAL3.csv', index=False)
files.download('subjectivity-MPQA-EMNLP-SENSEVAL3.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

###5

In [None]:
translator = GoogleTranslator('en', 'id')

with open("/content/MPQA/goldstandard.total.acl06.txt", encoding="ISO-8859-1") as f:
    list_of_words = set()
    sub = dict()
    obj = dict()

    for line in f:
      words = line.rstrip().split(" ")
      words, label = words[1], words[3]
      words = translator.translate(words)
      list_of_words.add(words)

      if label == "S":
        count = sub.get(words, 0)
        sub[words] = count + 1
      elif label == "O":
        count = obj.get(words, 0)
        obj[words] = count + 1

In [None]:
data = []
for word in list_of_words:
  sub_count = sub.get(word, 0)
  obj_count = obj.get(word, 0)
  total = sub_count + obj_count
  data.append((word, sub_count/total, obj_count/total))

In [None]:
header = ['text', 'subjectivity_score', 'objectivity_score']
df = pd.DataFrame(data, columns=header).sample(frac=1).reset_index(drop=True)
df.to_csv('subjectivity-MPQA-goldstandard.csv', index=False)
files.download('subjectivity-MPQA-goldstandard.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

###6

In [None]:
translator = GoogleTranslator('en', 'id')

with open("/content/MPQA/mixed.txt", encoding="ISO-8859-1") as f:
    list_of_words = set()
    sub = dict()
    obj = dict()

    for line in f:
      words = line.rstrip().split(" ")
      words, label = words[0], words[1]
      words = words.split("#")[0]
      words = translator.translate(words)
      list_of_words.add(words)

      if label == "S":
        count = sub.get(words, 0)
        sub[words] = count + 1
      elif label == "O":
        count = obj.get(words, 0)
        obj[words] = count + 1

In [None]:
data = []
for word in list_of_words:
  sub_count = sub.get(word, 0)
  obj_count = obj.get(word, 0)
  total = sub_count + obj_count
  data.append((word, sub_count/total, obj_count/total))

In [None]:
header = ['text', 'subjectivity_score', 'objectivity_score']
df = pd.DataFrame(data, columns=header).sample(frac=1).reset_index(drop=True)
df.to_csv('subjectivity-MPQA-mixed.csv', index=False)
files.download('subjectivity-MPQA-mixed.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Merge

In [None]:
df1 = pd.read_csv('subjectivity-MPQA-CoNNL.csv')
df2 = pd.read_csv('subjectivity-MPQA-EMNLP-SENSEVAL1.csv')
df3 = pd.read_csv('subjectivity-MPQA-EMNLP-SENSEVAL2.csv')
df4 = pd.read_csv('subjectivity-MPQA-EMNLP-SENSEVAL3.csv')
df5 = pd.read_csv('subjectivity-MPQA-goldstandard.csv')
df6 = pd.read_csv('subjectivity-MPQA-mixed.csv')

In [None]:
df = pd.concat([df1, df2, df3, df4, df5, df6]).sample(frac=1).reset_index(drop=True)
df.to_csv('subjectivity-MPQA-All.csv', index=False)
files.download('subjectivity-MPQA-All.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import os
os.getcwd()

'/content'

# Subjectivity: MPQA to News

In [None]:
!pip install --upgrade --no-cache-dir gdown



In [None]:
!gdown --folder "13qN0CC2r3LxqZvZ8DmDMWJ_tWzOKcHO5"
!gdown --folder "1cszZtjGiWoS5kJEU3cF-VZaXDYMiV7KR"

Retrieving folder list
Processing file 1iGZLWY0Y0fdNRmSjKUm44fwoQXbJpHt9 cnn_indonesia_100_20231205231220.csv
Processing file 1dxt8rn3KLRJj4zOQEcD_GXMl_SH0oacsya41TIGR6bM cnn_indonesia_1000_20231206070128
Processing file 1c_8-LWIVH2okPnZTTfEkzLLiIHlgyhCq cnn_indonesia_1000_20231206070128.csv
Processing file 11wPmtc-mUTmexi58Lt7b1dQxmOUDF3Lp crawler.py
Processing file 1SCCZO6F07D_mfmsvQwTvcdCXXGo_4Z-3 detik_100_20231205230955.csv
Processing file 1iqNVKpY0aoM35BMYV7lfPSRshpWHkKm7 detik_1000_20231206041749.csv
Processing file 1wy_19VH8Ony2pWqdZEb7OE3ZXEpW7mIj google_Doki Doki Literature Club_100_20231205231947.csv
Processing file 1cwTIQPqo7jbMTSxQ4jCh_Xwn0KD3MwTw google_pemilu 2024_100_20231205232258.csv
Processing file 1SagKW-hi27NJyEkaWBedRDzhZoiAuJc0 kompas_100_20231205231152.csv
Processing file 14zkq-Vvsvr8ENYDbLUSWYrGNtOuwWr3L kompas_1000_20231206101000.csv
Retrieving folder list completed
Building directory structure
Building directory structure completed
Access denied with the foll

In [None]:
import pandas as pd
from google.colab import files
import string
import unicodedata
import numpy as np

In [None]:
df_detik = pd.read_csv('/content/Scrap Data/detik_1000_20231206041749.csv')
df_cnn = pd.read_csv('/content/Scrap Data/cnn_indonesia_1000_20231206070128.csv')
df_kompas = pd.read_csv('/content/Scrap Data/kompas_1000_20231206101000.csv')
df_words = pd.read_csv('/content/Clean Dataset/subjectivity-MPQA-All.csv')

In [None]:
df_news = pd.concat([df_detik, df_cnn, df_kompas]).sample(frac=1).reset_index(drop=True)

In [None]:
def preprocess(text):
    text = unicodedata.normalize('NFKD', text)
    text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ")
    text = text.replace('\\',"").replace('\n\n'," ").replace("KOMPAS.com -", "")
    return text

In [None]:
df_news['content'] = df_news['content'].astype(str).apply(preprocess)

In [None]:
is_subjective = []
for news in df_news['content']:
  obj = 0
  sub = 0

  for words in news.split(' '):
    words = words.translate(str.maketrans("","",string.punctuation))
    if words in df_words['text'].tolist():
      obj += df_words[df_words['text'] == words]['objectivity_score'].iloc[0]
      sub += df_words[df_words['text'] == words]['subjectivity_score'].iloc[0]

  if obj > sub:
    is_subjective.append(0)
  elif sub > obj:
    is_subjective.append(1)
  else:
    is_subjective.append(2)

In [None]:
from collections import Counter
print(Counter(is_subjective))

Counter({0: 2522, 1: 463, 2: 15})


In [None]:
df_news['is_subjective'] = is_subjective
df_news.drop(df_news[df_news['is_subjective'] == 2].index, inplace=True)

In [None]:
df_news

Unnamed: 0,title,content,is_subjective
0,Mendikbud Nadiem Ungkap Alasan Ranking Indones...,"Menteri Pendidikan, Kebudayaan, Riset, dan Te...",1
1,Pemkot Tangerang Raih Penghargaan Anindhita Wi...,Pemerintah Kota (Pemkot) Tangerang meraih peng...,0
2,Skor Literasi Membaca PISA 2022: Indonesia Tur...,Organisation for Economic Co-operation and De...,0
3,Kaki Ayah Britney Spears Diamputasi karena Inf...,"Ayah penyanyi Britney Spears, Jamie Spears (7...",0
4,Mahfud Gelar Rapat Bahas Polemik Penolakan Pen...,Menko Polhukam Mahfud Md akan menggelar rapat ...,0
...,...,...,...
2995,Lebih Dekat dengan Roadster Listrik MG Cyberster,"BANGKOK, KOMPAS.com – MG Cyberster adalah sebu...",0
2996,"Dapat Perawatan Gigi Gratis di RS Pelni, Peser...",KOMPAS.com – Seorang peserta Jaminan Kesehatan...,0
2997,"Perkuat Bisnis di Indonesia dan Australia, Ana...","JAKARTA, Perusahaan jasa pertambangan PT Buki...",0
2998,Rekomendasi Kopi Sesuai Zodiak dan Kepribadian...,Kita pasti sering merasa harus minum kopi tap...,1


In [None]:
indices_to_filter = df_news[df_news['is_subjective'] == 0].index
num_rows_to_keep = int(0.2 * len(df_news))
rows_to_keep = np.random.choice(indices_to_filter, size=num_rows_to_keep, replace=False)
rows_to_keep = np.append(rows_to_keep, df_news[df_news['is_subjective'] == 1].index)
df_news = df_news.loc[rows_to_keep].sample(frac=1).reset_index(drop=True)

In [None]:
df_news.to_csv('subjectivity-MPQA-All-News.csv', index=False)
files.download('subjectivity-MPQA-All-News.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Bias & Neutrality

### Prep

In [2]:
!gdown "1Xl5sYoSJP_AUvMh6YyMW-eLfLs8K3qhE"
!gdown "1Db1uzijokFCQcmj-4DvphA-Uzw7kHduR"
!gdown "12FxlXSDPvqdS7LPB_fYwP-ZYbIXh4-IZ"
!pip install deep-translator

Downloading...
From: https://drive.google.com/uc?id=1Xl5sYoSJP_AUvMh6YyMW-eLfLs8K3qhE
To: /content/final_labels_MBIC.csv
100% 8.53M/8.53M [00:00<00:00, 38.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Db1uzijokFCQcmj-4DvphA-Uzw7kHduR
To: /content/final_labels_SG1.csv
100% 706k/706k [00:00<00:00, 10.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=12FxlXSDPvqdS7LPB_fYwP-ZYbIXh4-IZ
To: /content/final_labels_SG2.csv
100% 1.46M/1.46M [00:00<00:00, 11.2MB/s]
Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: deep-translator
Successfully installed deep-translator-1.11.4


In [3]:
import pandas as pd
from google.colab import files
import numpy as np

In [4]:
df_a = pd.read_csv('final_labels_SG1.csv', sep=';')
df_b = pd.read_csv('final_labels_SG2.csv', sep=';')
df_c = pd.read_csv('final_labels_MBIC.csv', sep=';')
full_df = pd.concat([df_a, df_b, df_c]).sample(frac=1).reset_index(drop=True)

In [5]:
df = full_df[['text', 'topic', 'type', 'label_bias']]
df.drop(df[df['label_bias'] == 'No agreement'].index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['label_bias'] == 'No agreement'].index, inplace=True)


In [6]:
df['topic'].value_counts()

white-nationalism                        377
environment                              371
gender                                   364
student-debt                             360
sport                                    359
abortion                                 350
marriage-equality                        347
immigration                              346
vaccines                                 343
trump-presidency                         342
gun-control                              340
coronavirus                              339
middle-class                             336
elections-2020                           322
vaccine                                  299
black lives matter                       289
international-politics-and-world-news    241
taxes                                    228
universal health care                    220
gun control                              214
islam                                    207
blm                                      147
#metoo    

In [7]:
df.groupby(['topic', 'label_bias']).size()

topic                                  label_bias
#metoo                                 Biased          1
                                       Non-biased     28
abortion                               Biased        203
                                       Non-biased    147
black lives matter                     Biased        125
                                       Non-biased    164
blm                                    Biased          2
                                       Non-biased    145
coronavirus                            Biased        182
                                       Non-biased    157
elections-2020                         Biased        184
                                       Non-biased    138
environment                            Biased        194
                                       Non-biased    177
gender                                 Biased        172
                                       Non-biased    192
gun control                           

In [8]:
df.drop(df[df['topic'] == '#metoo'].index, inplace=True)
df.drop(df[df['topic'] == 'blm'].index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['topic'] == '#metoo'].index, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['topic'] == 'blm'].index, inplace=True)


In [9]:
df.groupby(['topic', 'type']).size()

topic                                  type  
abortion                               center     60
                                       left      148
                                       right     142
black lives matter                     center    137
                                       left       74
                                       right      78
coronavirus                            center     90
                                       left      105
                                       right     144
elections-2020                         center     93
                                       left      115
                                       right     114
environment                            center     59
                                       left      167
                                       right     145
gender                                 center     64
                                       left      168
                                       right     132


### Bias

In [10]:
bias_df = pd.DataFrame()

for topic in df['topic'].unique():
    topic_df = df[df['topic'] == topic]
    for is_biased in topic_df['type'].unique():
        subset_df = topic_df[topic_df['type'] == is_biased]

        balanced_subset_df = subset_df.sample(min(subset_df.shape[0], 20))
        bias_df = pd.concat([bias_df, balanced_subset_df])

bias_df.reset_index(drop=True, inplace=True)

In [11]:
bias_df = pd.get_dummies(bias_df, columns=['label_bias'])[['text', 'label_bias_Biased']]
bias_df.rename(columns={'label_bias_Biased': 'is_biased'}, inplace=True)

In [12]:
bias_df

Unnamed: 0,text,is_biased
0,"After winning his 15th major championship, the...",0
1,Coach Guerrier adds that the referee’s continu...,0
2,"Last week, another major advancement for women...",0
3,While each sport has its own rules with regard...,1
4,"Of course, social distancing will be inconceiv...",0
...,...,...
1075,Before sundown on Thursday around 150 proteste...,0
1076,A tanker truck drove through thousands of peop...,0
1077,The result is that at a time when the United S...,0
1078,The U.N. Human Rights Council on Friday condem...,0


In [13]:
bias_df.to_csv('bias-filtered-reduced.csv', index=False)
files.download('bias-filtered-reduced.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Neutrality

In [14]:
netr_df = pd.DataFrame()

for topic in df['topic'].unique():
    topic_df = df[df['topic'] == topic]
    for is_biased in topic_df['type'].unique():
        subset_df = topic_df[topic_df['type'] == is_biased]

        balanced_subset_df = subset_df.sample(min(subset_df.shape[0], 15))
        netr_df = pd.concat([netr_df, balanced_subset_df])

netr_df.reset_index(drop=True, inplace=True)

In [15]:
netr_df = pd.get_dummies(netr_df, columns=['type'])[['text', 'type_left', 'type_center', 'type_right']]
netr_df.rename(columns={'type_left': 'is_left', 'type_center': 'is_center', 'type_right': 'is_right'}, inplace=True)

In [16]:
netr_df.to_csv('netr-filtered-reduced.csv', index=False)
files.download('netr-filtered-reduced.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>