<a href="https://colab.research.google.com/github/trueconnor/BUS-Z-798/blob/main/Connor_Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Get external files
!mkdir -p texts
!wget -q https://www.dropbox.com/s/5ibk0k4mibcq3q6/AussieTop100private.zip?dl=1 -O ./texts/AussieTop100private.zip
!unzip -qq -n -d ./texts/ ./texts/AussieTop100private.zip

# Standard library imports
import glob, string
from pathlib import Path
from collections import Counter

# 3rd party imports
import nltk, nltk.sentiment
from nltk.corpus import wordnet as wn
from nltk.sentiment.util import mark_negation
import pandas as pd

# Downloads nltk corpora for preprocessing tasks
# "punkt_tab" and "averaged_perceptron_tagger_eng" are the resource names that
# newer NLTK releases look up for sentence tokenization and POS tagging; the
# older names are kept so the notebook also runs on earlier releases.
# NOTE(review): with quiet=True a resource name that cannot be fetched is
# expected to be skipped silently rather than raise — confirm on the target
# NLTK version.
for resource in ("stopwords",
                 "punkt", "punkt_tab",
                 "wordnet",
                 "averaged_perceptron_tagger", "averaged_perceptron_tagger_eng"):
  nltk.download(resource, quiet=True)

# Creates path variables to texts
# Both corpus folders sit under ./texts, resolved against the current working directory.
about_dir, pr_dir = (Path.cwd().joinpath("texts", folder) for folder in ("About", "PR"))
dirs_to_load = [about_dir, pr_dir]

In [None]:
# Load texts

# Seed the corpus with three hard-coded 'news' records, numbered '1' to '3'.
# Each record has the same three keys (text_type, text_id, text) that the
# file-loading loop below uses when it appends to this list.
texts = [
    {'text_type': 'news', 'text_id': str(number), 'text': body}
    for number, body in enumerate(
        (
            'This is a news article about a company.',
            'This is another news article about a company.',
            'This is a third news article about a company.',
        ),
        start=1,
    )
]


# Append one record per .txt file found directly inside each corpus directory.
#   text_type -> name of the folder containing the file
#   text_id   -> file name, including the .txt extension
#   text      -> full file contents
for directory in dirs_to_load:
  # glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
  # the row order of the corpus reproducible between runs.
  for file in sorted(glob.glob(str(Path(directory) / "*.txt"))):
    file_path = Path(file)  # Path parts instead of splitting on "/", which is OS-specific
    # Explicit encoding so the result does not depend on the platform's locale
    # default. NOTE(review): assumes the corpus files are UTF-8 — confirm.
    texts.append({'text_type': file_path.parent.name,
                  'text_id': file_path.name,
                  'text': file_path.read_text(encoding="utf-8")})

# Text Preprocessing Pipeline
#
# Rewrites each article's 'text' entry in place: one raw string becomes a list
# of sentences, and each sentence a list of processed tokens. After every stage
# the first two sentences of the first article are printed as a worked example.

# Stage resources do not depend on the article, so they are built once.
table = str.maketrans('', '', string.punctuation)  # deletes every ASCII punctuation character
translation_dict = {"'s": "is",
                    "n't": "not",
                    "IT": "Information Technology"}
stop_words = nltk.corpus.stopwords.words("english")
stop_word_set = set(stop_words)  # set gives constant-time membership tests
pos_map = {'J': wn.ADJ, 'N': wn.NOUN, 'R': wn.ADV, 'V': wn.VERB}
lemmatizer = nltk.stem.WordNetLemmatizer()

for article_idx, article in enumerate(texts):
  show_example = article_idx == 0  # only the first article is printed

  if show_example:
    print("---Original--")
    print(article['text'][0:367])

  # Segmentation: raw string -> list of sentence strings
  article['text'] = nltk.tokenize.sent_tokenize(article['text'])
  if show_example:
    print("\n---After segmentation--")
    print(article['text'][0:2])

  # Tokenization: each sentence string -> list of token strings
  article['text'] = [nltk.tokenize.word_tokenize(sentence) for sentence in article['text']]
  if show_example:
    print("\n---After tokenization--")
    print(article['text'][0:2])

  # Token replacement
  # The dictionary keys are spelled with their original case and apostrophes,
  # so the lookup must run on raw tokens. Once tokens are lowercased and
  # stripped of punctuation they read "s", "nt" and "it", and no key can match.
  article['text'] = [[translation_dict.get(word, word) for word in sentence]
                     for sentence in article['text']]
  if show_example:
    print("\n---After token replacement removal--")
    print(article['text'][0:2])

  # Case conversion
  article['text'] = [[word.lower() for word in sentence] for sentence in article['text']]
  if show_example:
    print("\n---After case conversion--")
    print(article['text'][0:2])

  # Non-word character removal
  # Drops tokens that are purely digits, strips punctuation from the rest and
  # discards tokens that become empty (i.e. were punctuation only).
  article['text'] = [
      [stripped
       for stripped in (word.translate(table) for word in sentence if not word.isdigit())
       if stripped]
      for sentence in article['text']
  ]
  if show_example:
    print("\n---After non-word character removal--")
    print(article['text'][0:2])

  # Negation marking
  # Runs on the current tokens (the ones that went through token replacement).
  # It stays after punctuation removal because string.punctuation contains "_",
  # which would otherwise be stripped from the "_NEG" suffix added here.
  # NOTE(review): with punctuation already gone there is no clause boundary
  # left, so a negation presumably extends to the end of its sentence — confirm
  # this scope is what the analysis wants.
  article['text'] = [nltk.sentiment.util.mark_negation(sentence) for sentence in article['text']]
  if show_example:
    print("\n---After negation marking--")
    print(article['text'][0:2])

  # Stop word removal
  article['text'] = [[word for word in sentence if word not in stop_word_set]
                     for sentence in article['text']]
  if show_example:
    print("\n---After stop word removal--")
    print(article['text'][0:2])

  # Lemmatization
  # The first letter of each POS tag selects the WordNet part of speech;
  # tags starting with any other letter fall back to noun.
  article['text'] = [
      [lemmatizer.lemmatize(token, pos_map.get(tag[0], wn.NOUN))
       for token, tag in nltk.pos_tag(sentence)]
      for sentence in article['text']
  ]
  if show_example:
    print("\n---After lemmatization--")
    print(article['text'][0:2])

# One row per article; columns come from the record keys (text_type, text_id, text).
corpus_df = pd.DataFrame(texts)