<a href="https://colab.research.google.com/github/wzy0523/P7-Text-sentiment-analysis/blob/sean_001_preprocessing/group_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Google Drive authentication for loading datasets from Google Drive

In [75]:
# Set up a PyDrive client so the dataset CSV files can be read from Google Drive into Colaboratory.
# !pip install -U -q PyDrive #<-run once
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from google.colab import files
from google.colab import drive
from oauth2client.client import GoogleCredentials
# Authenticate the Colab user, then hand the application-default credentials to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE: this rebinds `drive` -- from here on the name refers to the PyDrive client,
# not to the `google.colab.drive` module imported a few lines above.
drive = GoogleDrive(gauth)

### Libraries Import

In [76]:
# --- standard library ---
import os
import time as tm

# --- third party ---
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize

# Seed passed to every `.sample(...)` call in the loading cell.
RANDOM_STATE = 42

# NLTK data packages downloaded for this notebook, in the original order.
nltk_packages = [
    'averaged_perceptron_tagger',
    'omw-1.4',
    'wordnet',
    'punkt',
    'stopwords',
]
download_status = [nltk.download(package) for package in nltk_packages] #<- run once
download_status[-1]  # cell output: status returned by the final download

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Load Datasets

In [77]:
# Each entry is [Google Drive file id, local filename to save it under].
# NOTE: this assignment rebinds `files`, which was imported from `google.colab` earlier.
files = [
    ['1BAKoF0KapnjuITPWck1I133xNsMJktYe','news.csv'],
    ['1LbkNORZPmql02cSvnzFnaeH7hsEiwN5r', 'covid.csv'],
    ['1qRFsO3345fRrRgpt7MEmsshSU9YSjtWQ', 'email_tr.csv'],
    ['1JcjgjLKeZulHVDe2weCaHHV-Lex3vGGh', 'email_ts.csv'],
    ['1zGyIbieitVGolpUq65V4g7PI6dLlQ_wi', 'imdb.csv'],
    ['1YII5laqXiUtngbGsBnC4vnQA3GT36kgn', 'review.csv'],
    ['1CoCExkzRr9_fof_gxQGQ-GWLt2wvvLgv', 'twitter.csv']
    ]

# Fetch every dataset through the PyDrive client into the working directory.
for file_id, local_name in files:
  downloaded = drive.CreateFile({'id': file_id})
  downloaded.GetContentFile(local_name)
  # one-second pause between downloads (presumably to stay clear of rate limits)
  tm.sleep(1)

In [78]:
# Number of rows randomly drawn from each of the three large corpora.
N_SAMPLES = 10000

# NOTE on `.str.replace`: `regex=` is always passed explicitly. pandas >= 2.0 defaults to
# regex=False, which would silently turn the twitter pattern below into a literal
# (never-matching) string; older versions default to regex=True and emit a FutureWarning.

# News: headerless CSV with the text in column 1; undecodable bytes are ignored.
news_txt = pd.read_csv('news.csv', header=None, encoding_errors='ignore')[1]

covid_txt = pd.read_csv('covid.csv')['Description']

# Emails: both email files stacked into one Series, newline characters stripped.
email_txt = pd.concat(
    [pd.read_csv('email_ts.csv')['email_body'], pd.read_csv('email_tr.csv')['email_body']],
    ignore_index=True,
).str.replace('\n', '', regex=False)

# IMDB: drop the HTML line-break markup, then sample.
imdb_txt = (
    pd.read_csv('imdb.csv')['review']
    .str.replace('<br />', '', regex=False)
    .sample(n=N_SAMPLES, random_state=RANDOM_STATE, ignore_index=True)
)

# Reviews: rows without text are dropped before sampling.
review_txt = (
    pd.read_csv('review.csv')['Review Text']
    .dropna()
    .sample(n=N_SAMPLES, random_state=RANDOM_STATE, ignore_index=True)
)

# Twitter: headerless CSV with the text in column 5; strip @mentions and http(s) links
# (raw string so the regex escapes are not interpreted as Python string escapes).
twitter_txt = (
    pd.read_csv('twitter.csv', header=None, encoding_errors='ignore')[5]
    .str.replace(r'(?:\@|https?\://)\S+', '', regex=True)
    .sample(n=N_SAMPLES, random_state=RANDOM_STATE, ignore_index=True)
)

  # This is added back by InteractiveShellApp.init_path()


# Preprocessing


### preprocessing functions

In [79]:
# Shared resources used by the preprocessing functions defined in this cell.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)  # set, so membership checks are cheap
lemmatizer = WordNetLemmatizer()

def lemmatization(text_c):
  """Lemmatize POS-tagged sentences and rebuild each one as a plain string.

  text_c: iterable of sentences; each sentence is a sequence of entries whose
          element 0 is the token and element 1 is its POS tag string.
  Returns a list with one string per sentence: the lemmas joined by single
  spaces, with leading/trailing whitespace stripped.
  """
  def wordnet_pos(tag):
    # Choose the WordNet POS constant from the tag's first letter;
    # None means "no mapping" and the lemmatizer's default is used.
    if tag.startswith('J'):
      return wordnet.ADJ
    if tag.startswith('V'):
      return wordnet.VERB
    if tag.startswith('N'):
      return wordnet.NOUN
    if tag.startswith('R'):
      return wordnet.ADV
    return None

  lemmed = []
  for sentence in text_c:
    # First resolve the POS for every token of the sentence ...
    resolved = [(entry[0], wordnet_pos(entry[1])) for entry in sentence]
    # ... then lemmatize, passing `pos` only when a mapping was found.
    lemmas = [
      lemmatizer.lemmatize(word) if pos is None else lemmatizer.lemmatize(word, pos = pos)
      for word, pos in resolved
    ]
    lemmed.append(' '.join(lemmas).strip())
  return lemmed

def sw_removal(txt):
  """Return `txt` with every token found in `stop_words` removed.

  The string is split on single spaces and the surviving tokens are
  re-joined with single spaces.
  """
  kept = []
  for token in txt.split(' '):
    if token in stop_words:
      continue
    kept.append(token)
  return " ".join(kept)

def pre_process(txt_col):
  """Clean, tokenize, lemmatize and stop-word-filter a column of raw text.

  Parameters
  ----------
  txt_col : pandas.Series
      Raw text, one document per element. Missing values are treated as
      empty documents.

  Returns
  -------
  pandas.DataFrame
      One row per input element (fresh 0..n-1 index) with two columns:
      'sw_include' -- the lemmatized text with stop words kept;
      'sw_exclude' -- the same text with stop words removed.
  """

  # lowercasing (NaN -> '' first, so the tokenizer never receives a float)
  lwrd = txt_col.fillna('').str.lower()

  # URL removal. This has to happen BEFORE punctuation is stripped: once ':' and '.'
  # have been turned into spaces the URL is in pieces and the pattern cannot remove it.
  # (Previously this step ran last and its result was never used.)
  no_url = lwrd.str.replace(r'http\S+', '', regex=True)

  # non-alphanumeric removal ('/' and spaces are kept). regex=True is explicit because
  # pandas >= 2.0 would otherwise treat the pattern as a literal string.
  chrnum = no_url.str.replace(r'[^0-9a-zA-Z/ ]', ' ', regex=True)

  # other character removals: collapse runs of spaces, then drop a stand-alone "s"
  # (e.g. what is left of "it's" after the apostrophe became a space)
  rp1 = chrnum.str.replace(r' +', ' ', regex=True)
  rp2 = rp1.str.replace(' s ', ' ', regex=False)

  # tokenization and postagger
  tkn = rp2.apply(nltk.word_tokenize)
  postag = tkn.apply(nltk.pos_tag)

  # lemmatization
  lemm = lemmatization(postag)

  # stopword removal
  swr = [sw_removal(t) for t in lemm]

  return pd.DataFrame(list(zip(lemm, swr)), columns=['sw_include', 'sw_exclude'])


### Preprocessing

In [80]:
# Run the same preprocessing pipeline over every corpus, in the original order.
news, covid, email, imdb, review, twitter = map(
    pre_process,
    (news_txt, covid_txt, email_txt, imdb_txt, review_txt, twitter_txt),
)



### Export to Files

In [81]:
# Mount Google Drive so the results can be written to it.
# The module is imported under an alias on purpose: the bare name `drive` is bound to
# the PyDrive client that the dataset-download cell calls `.CreateFile(...)` on, and
# re-importing the module as `drive` would break that cell on any later re-run.
from google.colab import drive as colab_drive
colab_drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [83]:
# Output folder on the mounted Google Drive.
wd = '/content/drive/MyDrive/UNI/COMPSCI 760/assignments/group project/scripts/output/'

# (preprocessed frame, output filename) pairs, written in this order.
exports = [
    (news, 'news_preprocess.csv'),
    (covid, 'covid_preprocess.csv'),
    (email, 'email_preprocess.csv'),
    (imdb, 'imdb_preprocess.csv'),
    (review, 'review_preprocess.csv'),
    (twitter, 'twitter_preprocess.csv'),
]
for frame, filename in exports:
  frame.to_csv(wd+filename, index=False)

# Meta Extraction <u>(not done)</u>



In [33]:
# Length statistics for the preprocessed tweets.
# `pre_process` returns a DataFrame with named columns, so the text is read from
# 'sw_include' (its first column) rather than `twitter[0]`, which raises KeyError.
# The summary is built from `t`; the original referenced an undefined name `l`.
# NOTE(review): the +1 offset is kept from the original; its purpose is not evident here.
t = [len(m)+1 for m in twitter['sw_include']]
pd.Series(t).describe()

count    10000.000000
mean        61.809200
std         33.295018
min          1.000000
25%         34.000000
50%         57.000000
75%         88.000000
max        145.000000
dtype: float64

In [34]:
# Length statistics for the preprocessed emails.
# `pre_process` returns a DataFrame with named columns, so the text is read from
# 'sw_include' (its first column) rather than `email[0]`, which raises KeyError.
# NOTE(review): the +1 offset is kept from the original; its purpose is not evident here.
e = [len(m)+1 for m in email['sw_include']]
pd.Series(e).describe()

count     105.00000
mean      177.07619
std       200.34920
min        10.00000
25%        60.00000
50%        98.00000
75%       215.00000
max      1040.00000
dtype: float64

In [36]:
# Length statistics for the preprocessed news items.
# `pre_process` returns a DataFrame with named columns, so the text is read from
# 'sw_include' (its first column) rather than `news[0]`, which raises KeyError.
# NOTE(review): the +1 offset is kept from the original; its purpose is not evident here.
n = [len(m)+1 for m in news['sw_include']]
pd.Series(n).describe()

count    4846.000000
mean      118.734007
std        52.423016
min         7.000000
25%        78.000000
50%       111.000000
75%       152.000000
max       287.000000
dtype: float64