<a href="https://colab.research.google.com/github/wzy0523/P7-Text-sentiment-analysis/blob/sean_001_preprocessing/group_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Google Drive authentication for loading datasets from Google Drive

In [10]:
# Code to read csv file into Colaboratory:
# !pip install -U -q PyDrive #<-run once
# !pip install -U -q textstat #<-run once
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from google.colab import files
from google.colab import drive
from oauth2client.client import GoogleCredentials
# Authenticate the Colab user, then create the PyDrive client that the
# dataset-download cell uses (`drive.CreateFile(...)`).
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): this rebinds `drive`, which the import cell above bound to the
# google.colab `drive` module; from here on `drive` is the PyDrive client.
drive = GoogleDrive(gauth)

### Libraries Import

In [11]:
RANDOM_STATE = 42
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import numpy as np
import time as tm
import os
import re

# NLTK resources needed by the tokenizer, POS tagger, lemmatizer and stop-word
# filter used below. Packages that are already present are reported as
# up to date and not fetched again, so this cell is safe to re-run.
NLTK_RESOURCES = (
    'averaged_perceptron_tagger',
    'omw-1.4',
    'wordnet',
    'punkt',
    'stopwords',
)

# nltk.download() returns False when a package cannot be fetched. Collect every
# failure and stop here, instead of only displaying the status of the last call
# and letting a later cell fail with a LookupError.
failed_downloads = [name for name in NLTK_RESOURCES if not nltk.download(name)]
if failed_downloads:
  raise RuntimeError(f'NLTK resources failed to download: {failed_downloads}')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Load Datasets

In [12]:
# Each entry is [Google Drive file id, local file name to save it under].
# NOTE(review): `files` also rebinds the `files` module imported from
# google.colab in the first cell.
files = [
    ['1BAKoF0KapnjuITPWck1I133xNsMJktYe', 'news.csv'],
    ['1LbkNORZPmql02cSvnzFnaeH7hsEiwN5r', 'covid.csv'],
    ['1qRFsO3345fRrRgpt7MEmsshSU9YSjtWQ', 'email_tr.csv'],
    ['1SEQF7-xtk2MvJQfYTHZULFOa-ln4rj10', 'email.csv'],
    ['1zGyIbieitVGolpUq65V4g7PI6dLlQ_wi', 'imdb.csv'],
    ['1YII5laqXiUtngbGsBnC4vnQA3GT36kgn', 'review.csv'],
    ['1CoCExkzRr9_fof_gxQGQ-GWLt2wvvLgv', 'twitter.csv'],
]

# Fetch every file into the working directory through the PyDrive client.
for file_id, local_name in files:
  downloaded = drive.CreateFile({'id': file_id})
  downloaded.GetContentFile(local_name)
  # One-second pause between downloads (presumably to go easy on the Drive API).
  tm.sleep(1)

In [13]:
# News headlines: the file has no header row; the first column is the sentiment
# label and the second the text. Labels are encoded as
# 'positive' -> 1, 'negative' -> -1, anything else -> 0.
news_txt = (
    pd.read_csv('news.csv', header=None, names=['sent', 'text'], encoding_errors='ignore')
    .loc[:, ['text', 'sent']]
    .assign(
        sent=lambda frame: np.select(
            [frame['sent'] == 'positive', frame['sent'] == 'negative'],
            [1, -1],
            default=0,
        )
    )
)

# The covid dataset has no neutral class; its 0 label is recoded to -1 so that
# it matches the encoding used for the other datasets.
covid_txt = (
    pd.read_csv('covid.csv')
    .loc[:, ['Description', 'Sentiment']]
    .rename(columns={'Description': 'text', 'Sentiment': 'sent'})
    .assign(sent=lambda frame: frame['sent'].replace(0, -1))
)


# First e-mail corpus: `sentiment` is compared against 3
# (above -> 1, below -> -1, exactly 3 -> 0).
email1 = pd.read_csv('email_tr.csv')[['email_body','sentiment']].rename(columns={'email_body': 'text', 'sentiment': 'sent'})
email1['sent'] = np.select([(email1['sent'] > 3), (email1['sent'] < 3)], [1,-1], default=0)

# Second e-mail corpus (semicolon-separated).
# NOTE(review): `label` is used as-is; presumably it is already encoded as
# -1/0/1 like the other datasets -- confirm against the file.
email2 = pd.read_csv('email.csv', sep=";")[['email_body','label']].rename(columns={'email_body': 'text', 'label': 'sent'})

email_txt = pd.concat([email1,email2],ignore_index=True)
# Remove literal backslash-n sequences embedded in the bodies. Series.replace()
# only swaps cells whose whole value equals the pattern, so it never touched
# these substrings; .str.replace() with regex=False does a plain substring
# replacement.
email_txt['text'] = email_txt['text'].str.replace('\\n', '', regex=False)


# The IMDB dataset has no neutral class. A reproducible 10,000-row sample is
# taken, the '<br />' markup is deleted from the text, and the labels are
# encoded as 'positive' -> 1, 'negative' -> -1 (anything else -> 0).
imdb_txt = (
    pd.read_csv('imdb.csv')
    .rename(columns={'review': 'text', 'sentiment': 'sent'})
    .sample(n=10000, random_state=RANDOM_STATE, ignore_index=True)
    .assign(
        text=lambda frame: frame['text'].str.replace('<br />', '', regex=False),
        sent=lambda frame: np.select(
            [frame['sent'] == 'positive', frame['sent'] == 'negative'],
            [1, -1],
            default=0,
        ),
    )
)

# Tweets: header-less file where column 0 is the polarity and column 5 the
# tweet text; a reproducible 10,000-row sample is used.
twitter_txt = pd.read_csv('twitter.csv', header=None, encoding_errors='ignore')[[5,0]].rename(columns={5: 'text', 0: 'sent'}).sample(n=10000, random_state=RANDOM_STATE, ignore_index=True)
# Polarity codes follow the Sentiment140 convention (0 = negative, 2 = neutral,
# 4 = positive) -- NOTE(review): confirm the file uses it. The previous mapping
# sent 2 to -1 and let 0 fall through to the default, so every negative tweet
# was labelled neutral.
twitter_txt['sent'] = np.select([(twitter_txt['sent'] == 4),(twitter_txt['sent'] == 0)], [1,-1], default=0)

# Clothing reviews: the star rating is used as the label
# (1-2 -> negative, 3 -> neutral, 4-5 -> positive). Rows with missing values
# are dropped before a reproducible 10,000-row sample is taken.
review_txt = (
    pd.read_csv('review.csv')
    .loc[:, ['Review Text', 'Rating']]
    .rename(columns={'Review Text': 'text', 'Rating': 'sent'})
    .dropna()
    .sample(n=10000, random_state=RANDOM_STATE, ignore_index=True)
    .assign(
        sent=lambda frame: np.select(
            [frame['sent'] > 3, frame['sent'] < 3],
            [1, -1],
            default=0,
        )
    )
)

# Preprocessing


### Preprocessing functions

In [14]:
# Shared WordNet lemmatizer instance, used by lemmatization().
lemmatizer = WordNetLemmatizer()
# English stop-word list as a set, used by sw_removal() for membership tests.
stop_words = set(stopwords.words('english'))

def lemmatization(text_c):
  """Lemmatize POS-tagged sentences and return them as plain strings.

  Parameters
  ----------
  text_c : iterable
      One item per sentence; each sentence is a sequence of (token, POS tag)
      pairs, as produced by the POS-tagging step in pre_process().

  Returns
  -------
  list of str
      One string per sentence: the lemmas joined by single spaces, with
      leading/trailing whitespace stripped.
  """

  def lemma_of(token, tag):
    # The first letter of the tag selects the WordNet part of speech; any other
    # tag is lemmatized with the lemmatizer's default part of speech.
    pos_by_initial = {
      'J': wordnet.ADJ,
      'V': wordnet.VERB,
      'N': wordnet.NOUN,
      'R': wordnet.ADV,
    }
    pos = pos_by_initial.get(tag[:1])
    if pos is None:
      return lemmatizer.lemmatize(token)
    return lemmatizer.lemmatize(token, pos = pos)

  return [
    ' '.join(lemma_of(pair[0], pair[1]) for pair in sentence).strip()
    for sentence in text_c
  ]

def sw_removal(txt):
  """Return `txt` with every token found in `stop_words` removed.

  Tokens are obtained by splitting on single spaces and are compared verbatim
  (case-sensitively); the survivors are joined back with single spaces.
  """
  kept = []
  for token in txt.split(' '):
    if token in stop_words:
      continue
    kept.append(token)
  return " ".join(kept)

def pre_process(txt_df):
  """Clean, lemmatize and stop-word-filter a text/sentiment DataFrame.

  Parameters
  ----------
  txt_df : pandas.DataFrame
      Must contain a 'text' column (strings) and a 'sent' column (labels).

  Returns
  -------
  pandas.DataFrame
      One row per input row, with columns
      'ori_text'   - the untouched input text,
      'sw_include' - cleaned + lemmatized text, stop words kept,
      'sw_exclude' - the same text with stop words removed,
      'sentiment'  - the input label.
  """
  txt_col = txt_df['text']

  # lowercasing
  cleaned = txt_col.str.lower()

  # @mentions and URLs are removed FIRST: once punctuation has been replaced by
  # spaces the '@' and '://' markers are gone and the pattern can no longer
  # match. (Previously this step ran last and its result was never used.)
  cleaned = cleaned.str.replace(r'(?:\@|https?\://)\S+', '', regex=True)

  # non-alphanumeric removal; regex=True is explicit because pandas >= 2.0
  # treats the pattern as a literal string by default
  cleaned = cleaned.str.replace('[^0-9a-zA-Z/ ]', ' ', regex=True)

  # collapse runs of spaces, then drop the stray "s" left behind by
  # possessives/contractions ("it's" -> "it s")
  cleaned = cleaned.str.replace(' +', ' ', regex=True)
  cleaned = cleaned.str.replace(' s ', ' ', regex=False)

  # tokenization and postagger
  tokens = cleaned.apply(nltk.word_tokenize)
  tagged = tokens.apply(nltk.pos_tag)

  # lemmatization
  lemm = lemmatization(tagged)

  # stopword removal
  swr = [sw_removal(t) for t in lemm]

  return pd.DataFrame({
    'ori_text': list(txt_col),
    'sw_include': lemm,
    'sw_exclude': swr,
    'sentiment': list(txt_df['sent']),
  })


### Preprocessing

In [49]:
# Run the shared cleaning pipeline on every dataset (same order as loaded).
news, covid, email, imdb, review, twitter = map(
    pre_process,
    (news_txt, covid_txt, email_txt, imdb_txt, review_txt, twitter_txt),
)



### Export to Files

In [16]:
# Mount Google Drive so the preprocessed CSVs can be written to it.
# The module is imported under an alias on purpose: the authentication cell
# binds `drive` to the PyDrive client that the download cell calls
# (`drive.CreateFile`). Importing the module as plain `drive` here overwrote
# that client, so re-running the download cell after this one failed.
from google.colab import drive as colab_drive
colab_drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
# Output folder on the mounted Google Drive.
wd = '/content/drive/MyDrive/UNI/COMPSCI 760/assignments/group project/scripts/output/'

# Write each preprocessed dataset to its own CSV, without the index column.
for frame, out_name in (
    (news, 'news_preprocess.csv'),
    (covid, 'covid_preprocess.csv'),
    (email, 'email_preprocess.csv'),
    (imdb, 'imdb_preprocess.csv'),
    (review, 'review_preprocess.csv'),
    (twitter, 'twitter_preprocess.csv'),
):
  frame.to_csv(wd + out_name, index=False)

# Meta Extraction

In [50]:
# docs = [news, covid, email, imdb, review, twitter]

# general meta features extraction
def general_meta(df):
  """Add general (surface-level) meta features to a preprocessed DataFrame.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain 'sw_include' (cleaned text, tokens separated by spaces) and
      'ori_text' (the raw text); both columns are expected to hold strings.

  Returns
  -------
  pandas.DataFrame
      A copy of `df` (same index, input left untouched) with three extra
      columns:
      'length'     - number of whitespace-separated tokens in 'sw_include'
                     (0 for an empty string),
      'anum_char'  - number of characters in 'ori_text' matching
                     [0-9a-zA-Z/ ], i.e. letters, digits, '/' and spaces,
      'nanum_char' - number of characters in 'ori_text' outside that set.
  """
  kept_chars = re.compile('[0-9a-zA-Z/ ]')
  other_chars = re.compile('[^0-9a-zA-Z/ ]')

  new_df = df.copy()

  # The features are assigned as plain lists, i.e. positionally. Building a
  # separate RangeIndex DataFrame and assigning it aligned on index labels,
  # which produced NaNs for frames with a non-default index and raised on
  # empty frames.

  # length: token count. The previous `len(split) + 1` over-counted every row
  # by one.
  new_df['length'] = [len(text.split()) for text in df['sw_include']]

  # number of alpha-numeric characters (plus '/' and space, see docstring)
  new_df['anum_char'] = [len(kept_chars.findall(text)) for text in df['ori_text']]

  # number of non-alpha-numeric characters
  new_df['nanum_char'] = [len(other_chars.findall(text)) for text in df['ori_text']]

  return new_df

# Output columns of lex_meta(), in the order they are appended to the frame.
_LEX_COLUMNS = (
  'adjective', 'adposition', 'adverb', 'conjunction', 'determiner', 'noun',
  'numeral', 'particle', 'pronoun', 'verb', 'other',
)

# POS tags matched by prefix (e.g. 'NN' also covers 'NNS', 'NNP', ...).
_LEX_PREFIX_RULES = (
  ('JJ', 'adjective'),
  ('RB', 'adverb'),
  ('NN', 'noun'),
  ('PRP', 'pronoun'),
  ('VB', 'verb'),
)

# POS tags matched exactly.
_LEX_EXACT_RULES = {
  'IN': 'adposition',
  'CC': 'conjunction',
  'DT': 'determiner',
  'CD': 'numeral',
  'RP': 'particle',
}


def _lex_category(tag):
  """Map one POS tag to its lex_meta() column name ('other' if unmatched)."""
  for prefix, category in _LEX_PREFIX_RULES:
    if tag.startswith(prefix):
      return category
  return _LEX_EXACT_RULES.get(tag, 'other')


def lex_meta(df):
  """Add per-document part-of-speech counts to a preprocessed DataFrame.

  Each text in df['ori_text'] is lower-cased, tokenized with
  nltk.word_tokenize and tagged with nltk.pos_tag; every tagged token is
  counted in exactly one of the categories listed in _LEX_COLUMNS (tokens
  whose tag matches no rule, punctuation included, are counted as 'other').

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain an 'ori_text' column of strings.

  Returns
  -------
  pandas.DataFrame
      `df` with one extra integer column per category, same rows and index.
  """
  rows = []
  for text in df['ori_text'].str.lower():
    counts = dict.fromkeys(_LEX_COLUMNS, 0)
    for _, tag in nltk.pos_tag(nltk.word_tokenize(text)):
      counts[_lex_category(tag)] += 1
    rows.append(counts)

  # The counts frame reuses df's index so the column-wise concat pairs rows
  # one-to-one; with a fresh RangeIndex, a frame with any other index came back
  # with extra NaN-padded rows.
  counts_df = pd.DataFrame(rows, columns=list(_LEX_COLUMNS), index=df.index)
  new_df = pd.concat([df, counts_df], axis=1)
  return new_df

In [51]:
# General meta features for every preprocessed dataset.
(news_general_df, covid_general_df, email_general_df,
 imdb_general_df, review_general_df, twitter_general_df) = (
    general_meta(frame) for frame in (news, covid, email, imdb, review, twitter)
)

In [54]:
# Part-of-speech count features for every preprocessed dataset.
(news_lex_df, covid_lex_df, email_lex_df,
 imdb_lex_df, review_lex_df, twitter_lex_df) = [
    lex_meta(frame) for frame in (news, covid, email, imdb, review, twitter)
]