# 1. XOR function

In [None]:
import os
from google.colab import drive
drive.mount('/content/drive')

path = "/content/drive/My Drive"

os.chdir(path)
os.listdir(path)

Mounted at /content/drive


['negative-words.txt',
 'positive-words.txt',
 'Europass',
 '无标题文档.gdoc',
 'task4-tweets20-en - task4-tweets20-en（副本）.gsheet',
 'task4-tweets20-en - task4-tweets20-en.gsheet',
 'task4-tweets26-5_Answer.gsheet',
 'test.csv',
 'train.csv',
 'train.gsheet',
 'Colab Notebooks',
 'train_translated.gsheet',
 '无标题电子表格.gsheet',
 'Anniversary.gsheet']

In [15]:
import pandas as pd

In [34]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [None]:
# Import necessary libraries
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


In [None]:
# prepare training data
X=[
   [0,0],
   [0,1],
   [1,0],
   [1,1]
]
y=[-1,1,1,-1]

# convert X,y to np array
X=np.array(X)
y=np.array(y)

In [None]:
# define neural network
mlp=MLPClassifier(random_state=1,max_iter=100000, hidden_layer_sizes=(5,5),
                 activation='relu', alpha=0.0001, 
                 solver='lbfgs',verbose=True)
# set output node to use sigmoid activation function
mlp.out_activation_='logistic'

In [None]:
# Train the model
trained_model=mlp.fit(X,y)


In [None]:
# Test the model
test=[[0,1],[0,0],[1,1],[1,0]]
trained_model.predict(np.array(test))

array([-1, -1, -1,  1])

In [None]:
# weights
for w in trained_model.coefs_:
  print(w.shape)

(2, 5)
(5, 5)
(5, 1)


In [None]:
# bias
trained_model.intercepts_

[array([-0.14962269, -2.97043085, -0.5472481 ,  1.36215335, -0.87510813]),
 array([ 1.88523531, -0.82865212, -0.80904768, -0.03082989, -0.76999685]),
 array([-7.0403055])]

# 2 Text classifcation

## 2.1 Sentiment lexicon-based classifier

In [None]:
from collections import defaultdict

In [25]:
import re
import nltk # just use a corpus named stopwords to optimize built-in tokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import *

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [29]:

def reader():
  root_dir='Ethic_in_NLP/DATA/review_polarity/txt_sentoken'

  neg_reviews=read(root_dir,'neg')
  pos_reviews=read(root_dir,'pos')
  return neg_reviews,pos_reviews


def read(path, category):
  """
  """
  reviews=[]
  root_dir=path+'/'+category
  for file in os.listdir(root_dir):
    file_name=root_dir+'/'+file
    file_stream=open(file_name,'r')
    review=''
    for line in file_stream:
      line=line.strip('\n')
      review+=line
    text_noPunc=re.sub("[^a-zA-Z#]"," ", review)
    words=text_noPunc.lower().split() # cast word to lower and split them by space
    reviews.append(words)
    file_stream.close()
  
  return reviews
  




1000 1000


In [71]:
class Lexicon_Classifier():
  def __init__(self) -> None:
    self.posDict,self.negDict=self.loadData()

  
  def loadData(self):
    pos_path='positive-words.txt'
    neg_path='negative-words.txt'


    poswords=[]
    negwords=[]
    with open(file=pos_path) as f:
      for line in f:
        if not line.startswith(';'):
          poswords.append(line)
    
    with open(file=neg_path) as f:
      for line in f:
        if not line.startswith(';'):
          negwords.append(line)



    pos_dict=defaultdict(int)
    neg_dict=defaultdict(int)
    for item in poswords:
      item=item.strip('\n')
      if not item.isspace():
        pos_dict[item]+=1
    for item in negwords:
      item=item.strip('\n')
      if not item.isspace():
        neg_dict[item]+=1

    return pos_dict,neg_dict

  
  def predict(self, X):
    """
    """
    predicts=[]
    for item in X:
      pred_label=self.classify(item)
      predicts.append(pred_label)
    
    return predicts


  def classify(self, sent):
    """
    :sent: a tokenized sentence from review
    """
    # number of postive words, negative words
    pos_num=0
    neg_num=0
    for word in sent:
      if self.posDict.get(word)!=None:
        pos_num+=1
      if self.negDict.get(word)!=None:
        neg_num+=1
    
    # print(pos_num,neg_num)
    if pos_num>neg_num:
      return 1
    elif pos_num<neg_num:
      return -1
    else:
      return 0

  def evaluate(self, pred_y, gold_y):
    f1=f1_score(gold_y,pred_y,average='macro')
    ac=accuracy_score(gold_y, pred_y)

    print("The accuracy is {0}. The f1-score(average='macro') is {1}".format(ac,f1))



    
    

  def main_task_1_2_1():  
    neg_reviews,pos_reviews=reader()
    lc=Lexicon_Classifier()

    predicts=lc.predict(neg_reviews)

    gold_set_np=np.ones(shape=(len(neg_reviews),1))*(-1)
    gold=gold_set_np.tolist()
    lc.evaluate(predicts,gold)



main_task_1_2_1()








The accuracy is 0.719. The f1-score(average='macro') is 0.2788442893154935
