# 1. XOR function

In [2]:
import pandas as pd
import os

In [3]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [4]:
# Import necessary libraries
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


In [5]:
# prepare training data
X=[
   [0,0],
   [0,1],
   [1,0],
   [1,1]
]
y=[-1,1,1,-1]

# convert X,y to np array
X=np.array(X)
y=np.array(y)

In [6]:
# define neural network
mlp=MLPClassifier(random_state=1,max_iter=100000, hidden_layer_sizes=(5,5),
                 activation='relu', alpha=0.0001, 
                 solver='lbfgs',verbose=True)
# set output node to use sigmoid activation function
mlp.out_activation_='logistic'

In [7]:
# Train the model
trained_model=mlp.fit(X,y)


In [8]:
# Test the model
test=[[0,1],[0,0],[1,1],[1,0]]
trained_model.predict(np.array(test))

array([-1, -1, -1,  1])

In [9]:
# weights
for w in trained_model.coefs_:
  print(w.shape)

(2, 5)
(5, 5)
(5, 1)


In [10]:
# bias
trained_model.intercepts_

[array([-0.14962269, -1.54165669, -0.5472481 ,  1.69767268, -0.87510813]),
 array([ 3.58527483, -0.52189389, -0.80904768, -0.03082989, -0.76999685]),
 array([-13.17581923])]

# 2 Text classifcation

## 2.1 Sentiment lexicon-based classifier

In [11]:
from collections import defaultdict
import re
import nltk # just use a corpus named stopwords to optimize built-in tokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import *

In [76]:

def reader():
  root_dir='DATA/review_polarity/txt_sentoken'

  neg_reviews=read(root_dir,'neg')
  pos_reviews=read(root_dir,'pos')
  return neg_reviews,pos_reviews


def read(path, category):
  """
  load data from file, and combine sentences from the same file to one string without punctuation
  """
  reviews=[]

  # phase file path
  root_dir=path+'/'+category
  for file in os.listdir(root_dir):
    file_name=root_dir+'/'+file
    file_stream=open(file_name,'r')
    review=''

    # combine sentences
    for line in file_stream:
      line=line.strip('\n')
      review+=line

    # remove punctuation
    text_noPunc=re.sub("[^a-zA-Z#]"," ", review)

    # cast alphabets to lower and split the string combining sentence
    words=text_noPunc.lower().split() # cast word to lower and split them by space
    reviews.append(words)
    file_stream.close()
  
  return reviews
  



class Lexicon_Classifier():
  """
  classifier positive and negative by comparing the number of the positive words and negative words occurred in the given lexicon.

  Attribute:
  - posDict: A dictionary consisting of the positive word from the give lexicon
  - negDict: A dictionary consisting of the negative word from the give lexicon


  """
  def __init__(self) -> None:
    self.posDict,self.negDict=self.loadData()


  def loadData(self):
    pos_path='DATA/opinion_lexicon_English/positive-words.txt'
    neg_path='DATA/opinion_lexicon_English/negative-words.txt'


    poswords=[]
    negwords=[]
    with open(file=pos_path) as f:
      for line in f:
        if not line.startswith(';'):
          poswords.append(line)

    with open(file=neg_path) as f:
      for line in f:
        if not line.startswith(';'):
          negwords.append(line)

    pos_dict=defaultdict(int)
    neg_dict=defaultdict(int)
    for item in poswords:
      item=item.strip('\n')
      if not item.isspace():
        pos_dict[item]+=1
    for item in negwords:
      item=item.strip('\n')
      if not item.isspace():
        neg_dict[item]+=1

    return pos_dict,neg_dict


  def predict(self, X):
    """
    """
    predicts=[]
    for item in X:
      pred_label=self.classify(item)
      predicts.append(pred_label)

    return predicts


  def classify(self, sent):
    """
    :sent: a tokenized sentence from review
    """
    # number of positive words, negative words
    pos_num=0
    neg_num=0

    # count the number of pos/neg words in a review
    for word in sent:
      if self.posDict.get(word)!=None:
        pos_num+=1
      if self.negDict.get(word)!=None:
        neg_num+=1

    # print(pos_num,neg_num)
    if pos_num>neg_num:
      return 1
    elif pos_num<neg_num:
      return -1
    else:
      return 0

  def evaluate(self, pred_y, gold_y):
    f1=f1_score(gold_y,pred_y,average='macro')
    ac=accuracy_score(gold_y, pred_y)

    print("The accuracy is {0}.\nThe f1-score(average='macro') is {1}".format(ac,f1))



def main_task_1_2_1():
  neg_reviews,pos_reviews=reader()
  lc=Lexicon_Classifier()

  predicts=lc.predict(neg_reviews)

  # generate gold labels which actually are all -1 in this case
  gold_set_np=np.ones(shape=(len(neg_reviews),1))*(-1)
  gold=gold_set_np.tolist()
  lc.evaluate(predicts,gold)


main_task_1_2_1()

The accuracy is 0.719.
The f1-score(average='macro') is 0.2788442893154935


## 2.2 regressor classifier

In [15]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.stem.porter import *
from nltk.probability import FreqDist

[nltk_data] Downloading package stopwords to C:\Users\Marco
[nltk_data]     Yu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Marco
[nltk_data]     Yu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Marco Yu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [16]:
import pandas as pd

In [17]:
from nltk.stem.wordnet import WordNetLemmatizer
def reader():
  root_dir='DATA/review_polarity/txt_sentoken'

  neg_reviews=read(root_dir,'neg')
  pos_reviews=read(root_dir,'pos')
  return neg_reviews,pos_reviews


def read(path, category):
  """


  """
  # initialize a lemmatizer to lemmatize word
  lemmatizer=WordNetLemmatizer()
  stemmer=PorterStemmer()
  tag=5
  reviews=[]
  root_dir=path+'/'+category
  for file in os.listdir(root_dir):
    file_name=root_dir+'/'+file
    file_stream=open(file_name,'r')
    review=''
    for line in file_stream:
      line=line.strip('\n')
      review+=line

    # Text pre-processing 


    # remove punctation and cast all aphabets to lower one
    text_noPunc=re.sub("[^a-zA-Z#]"," ", review)
    words=text_noPunc.lower().split() 
    


    # normalization
    stop_words=stopwords.words('english')
    words=[stemmer.stem(lemmatizer.lemmatize(word)) for word in words 
           if word not in stop_words and len(word)>2 ]

    reviews.append(words)
    file_stream.close()
  
  return reviews


In [96]:
%time

class Vectorizer():
  """

  """

  def __init__(self,vocabulary) -> None:
    self.vocab=vocabulary


  def loadData(self):
    pos_path='DATA/opinion_lexicon_English/positive-words.txt'
    neg_path='DATA/opinion_lexicon_English/negative-words.txt'


    poswords=[]
    negwords=[]
    with open(file=pos_path) as f:
      for line in f:
        if not line.startswith(';'):
          poswords.append(line)

    with open(file=neg_path) as f:
      for line in f:
        if not line.startswith(';'):
          negwords.append(line)

    pos_dict=defaultdict(int)
    neg_dict=defaultdict(int)
    for item in poswords:
      item=item.strip('\n')
      if not item.isspace():
        pos_dict[item]+=1
    for item in negwords:
      item=item.strip('\n')
      if not item.isspace():
        neg_dict[item]+=1

    return pos_dict,neg_dict

  
  def vectorize(self,sentences):
    """
    for 2k sampled reviews, traning data costs 7 min at least 
    on colab with the standard virtual hardware

    """
    # initialize BOW matrix
    vectors_df=pd.DataFrame(data=np.zeros((1,len(self.vocab))).tolist(), columns=self.vocab, dtype=int)

    # count words and update BOW matrix
    for index, review in enumerate(sentences):
      vectors_df.loc[index]=0
      fq=nltk.FreqDist(review)
      for key in fq.keys():
        vectors_df.loc[index,key]+=fq.get(key)

    # reduce dimensions of the BOW matrix according word frequency
    fq_list=[]
    # calculate how many times the word occurs over the whole texts
    for key in vectors_df.keys():
      fq_list.append(vectors_df[key].sum())

    # sorting word frequency
    feature_fq=list(nltk.FreqDist(fq_list).keys())
    feature_fq.sort()

    # drop features that rarely present in texts
    drop_list=[key for key in feature_fq[:10]]
    keys=[]
    for key in vectors_df.keys():
      if vectors_df[key].sum() in drop_list:
        keys.append(key)

    vectors_df_reduced=vectors_df.drop(columns=keys)


    self.vocab=vectors_df_reduced
    os.makedirs('model/', exist_ok=True)
    vectors_df_reduced.to_csv('model/BOW_Model.csv')
    return vectors_df_reduced

def main_1_2_2():
  pos_set, neg_set = reader()

  # generate word vocabulary from all given texts
  vocab=[]
  for cate in (pos_set, neg_set):
    for sent in cate:
      for word in sent:
        vocab.append(word)

  total_reviews=pos_set+neg_set
  vc=Vectorizer(set(vocab))

  # if you run the program at the first time, you should vectorize the text by running "vc.vectorize()"
  # Otherwise, you just annotate this line of code, and remove the annotation symbol of the line "pd.read_csv()"
  # to read trained model from the file generated after the program running the first time in order to save time
  vv_df=vc.vectorize(total_reviews)
  # vv_df=pd.read_csv('Ethic_in_NLP/model/BOW_Model.csv')

  # generate gold labels for the reviews
  label_pos=np.ones(shape=(len(pos_set)))
  label_neg=np.zeros(shape=(len(neg_set)))
  y=np.concatenate((label_pos,label_neg))

  X=vv_df.to_numpy()

  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.3)

  # LR Classifier
  logistic_regressor=LogisticRegression(random_state=0,max_iter=1000,C=0.01)

  # training data
  trained_model=logistic_regressor.fit(X_train, y_train)

  # predict for test data
  predictions=trained_model.predict(X_test)

  # evaluation
  print(classification_report(y_test,predictions))

main_1_2_2()  # Word vectorization need almost 5 minutes.


Wall time: 0 ns
              precision    recall  f1-score   support

         0.0       0.82      0.82      0.82       301
         1.0       0.82      0.82      0.82       299

    accuracy                           0.82       600
   macro avg       0.82      0.82      0.82       600
weighted avg       0.82      0.82      0.82       600

