In [8]:
# import the package
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import re
import seaborn as sns
# import packages
from sklearn.linear_model import LogisticRegression
# from spellchecker import SpellChecker
from collections import Counter, OrderedDict
from collections import defaultdict
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
# import en_core_web_lg
import en_core_web_sm
nlp = en_core_web_sm.load()
import spacy 
from sklearn.feature_extraction.text import TfidfVectorizer
import pyLDAvis
from tracemalloc import stop
from langcodes import best_match
from matplotlib import rc
import matplotlib.pyplot as plt
import tomotopy as tp
from gensim.models import Phrases
# nlp = spacy.load("en_core_web_lg")
import pickle # for spelling chcker
from rich.console import Console
from rich.table import Table

# BERT
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader,Dataset
import random
# from transformers import BertTokenizer,BertModel,get_linear_schedule_with_warmup 
# import nlpaug.augmenter.sentence as nas
# import nlpaug.augmenter.char as nac
# import nlpaug.augmenter.word as naw

# 1. Data Overview

In [9]:
# Read the train, test and submission-file CSVs from ./NLP_dataset in one pass
df, df_test, df_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        './NLP_dataset/train.csv',
        './NLP_dataset/test.csv',
        './NLP_dataset/submission_file.csv',
    )
)

# 2. Data Preprocessing

## 2.1. Cleaning the text

In [10]:
# Any run of whitespace (Unicode-aware for str input)
_WHITESPACE_RE = re.compile(r'\s+')
# Literal backslash-u00b4 text, i.e. an acute accent that was escaped in the raw data
_ESCAPED_ACUTE_RE = re.compile(r"\\u00b4")
# Alternatives are tried left to right at each position:
#   @mentions | any char that is not ASCII alphanumeric, space or tab |
#   scheme://... URLs | a leading lowercase "rt" | "http" plus the one character after it
_NOISE_RE = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")

def preprocess(text):
  """Reduce a raw review to ASCII letters and digits separated by single spaces.

  Steps, in order:
    1. Turn no-break spaces (U+202F, U+00A0) into plain spaces. The literal
       text 'xa0' is also replaced, as the previous version did, to cope
       with dumps where the character was written out as an escape.
    2. Drop the escaped acute accent (backslash-u00b4) while the backslash is
       still present.
    3. Collapse every whitespace run (including newlines) to one space so
       that the punctuation strip below cannot glue neighbouring words.
    4. Remove @mentions, URLs, a leading "rt" and every character that is
       not an ASCII letter, digit, space or tab. All punctuation, including
       '?' and ')', is removed here.
    5. Collapse the gaps left by the removals and trim both ends.

  Args:
    text: the review as a str.

  Returns:
    The cleaned str; may be empty.
  """
  text = text.replace('\u202f', ' ').replace('\xa0', ' ').replace('xa0', ' ')
  text = _ESCAPED_ACUTE_RE.sub('', text)
  text = _WHITESPACE_RE.sub(' ', text)
  text = _NOISE_RE.sub('', text)
  return _WHITESPACE_RE.sub(' ', text).strip()

# Run preprocess over every review, overwrite the column with the result, then preview
cleaned_reviews = df['imdb_user_review'].apply(preprocess)
df['imdb_user_review'] = cleaned_reviews
df.head()

Unnamed: 0,helpfulness_cat,imdb_user_review
0,1.0,It is hard to find such delightful and adorabl...
1,1.0,They dont make films like this faded haunting ...
2,1.0,I first viewed this movie in 1924 at age 6 yrs...
3,1.0,I doubt that Id ever seen anything resembling ...
4,1.0,I was shocked to find myself riveted to this m...


In [14]:
from transformers import BertTokenizer,BertModel,get_linear_schedule_with_warmup 

In [15]:
# def tokenize(text):
#     return [str(token) for token in nlp(text) 
#         if not token.is_stop 
#         and not token.like_num
#         and not token.is_punct
#         and token.is_alpha
#         ]
# # from nltk.tokenize import TreebankWordTokenizer
# # voc = {}
# # tkns_= doc2bow(TreebankWordTokenizer().tokenize(df_train_clean['imbd_user_review'][0]), voc_=voc)
# 'bert-base-cased' ships a case-sensitive vocabulary, so the input must not be
# lower-cased before WordPiece lookup (lower-casing is for the *uncased* checkpoints).
tokenizer=BertTokenizer.from_pretrained('bert-base-cased',do_lower_case=False)
# input_test=tokenizer(df['imdb_user_review'][0],padding='max_length',max_length=512,truncation=True,return_tensors='pt') # max_length can be at most 512 # truncation drops everything past max_length # return_tensors can be 'pt' or 'tf'
# input_test
def tokenize(text,max_length=512):
  """Encode `text` with the module-level BERT `tokenizer`.

  The text is padded to `max_length` and truncated beyond it, and the result
  is returned as PyTorch tensors.

  Args:
    text: the string to encode.
    max_length: length to pad/truncate to (default 512, the previous fixed value).

  Returns:
    The tokenizer output; ReviewDataset reads its 'input_ids' and
    'attention_mask' entries. For a single string each entry is presumably
    shaped (1, max_length) - confirm against the tokenizer documentation.
  """
  return tokenizer(text,padding='max_length',max_length=max_length,truncation=True,return_tensors='pt')

In [16]:
import nlpaug.augmenter.sentence as nas
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw

In [17]:
def aug_abstract(text):
  """Return the abstractive-summary augmentation of `text`.

  Builds an nlpaug AbstSummAug backed by 't5-base' and hands back whatever
  its `augment` call returns for `text`.
  """
  summarizer=nas.abst_summ.AbstSummAug(model_path='t5-base')
  return summarizer.augment(text)

def aug_crop(text):
  """Augment `text` by chaining three nlpaug word-level augmenters.

  The augmenters are built first and then applied in this order, each one
  receiving the previous one's output:
    1. RandomWordAug(action='crop', aug_p=0.5) - randomly crop part of the sentence
    2. SynonymAug(aug_src='wordnet')           - synonym replacement
    3. ContextualWordEmbsAug('bert-base-uncased', action='substitute')
                                               - context-based substitution
  """
  pipeline=(
    naw.RandomWordAug(action='crop',aug_p=0.5),
    naw.SynonymAug(aug_src='wordnet'),
    naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute"),
  )
  for augmenter in pipeline:
    text=augmenter.augment(text)
  return text

In [None]:
# Take an independent copy of the rows whose helpfulness_cat equals 0, renumbered from 0
is_class_zero=df['helpfulness_cat']==0
df_aug=df[is_class_zero].copy().reset_index(drop=True)

# One aug_abstract result per selected review (applied row by row)
df_aug['augmented_text1']=df_aug['imdb_user_review'].apply(aug_abstract)
# df_aug['augmented_text2']=df_aug['imdb_user_review'].apply(aug_crop)

# os.chdir('/content/drive/My Drive')
df_aug
# df_aug.to_csv('df_aug.csv')

In [12]:
class ReviewDataset(Dataset):
  """Map-style dataset pairing tokenized reviews with their labels.

  Args:
    labels: pandas Series of class labels (stored as floats such as 1.0 in
      the source data).
    texts: pandas Series of review strings, positionally aligned with `labels`.

  Every text is passed through `tokenize` once at construction time, so
  `__getitem__` only indexes precomputed tensors.
  """
  def __init__(self,labels,texts):
    self.labels=labels
    self.texts=texts
    # Tokenize up front; each entry is the tokenizer output for one review
    self.tokens=self.texts.apply(tokenize)

  def __len__(self):
    """Number of reviews in the dataset."""
    return len(self.texts)

  def __getitem__(self,idx): # defining this method lets the dataset be indexed like a sequence
    """Return (input_ids, attention_mask, label) for the sample at position `idx`.

    The label is returned as a 0-dim torch.long tensor: class-index losses
    such as F.nll_loss require integer targets, and a raw Python float would
    be collated into a float64 batch.
    """
    encoded=self.tokens.iloc[idx]
    # squeeze(0) drops the leading dimension of the per-review tensors
    token=encoded['input_ids'].squeeze(0)
    attention_mask=encoded['attention_mask'].squeeze(0)
    label=torch.tensor(int(self.labels.iloc[idx]),dtype=torch.long)
    return token,attention_mask,label

In [13]:
# Hold out 20% of train.csv for evaluation (fixed random_state for a repeatable split).
# Note: test_df is this held-out slice of df, not the df_test read from test.csv.
train_df,test_df=train_test_split(df,test_size=0.2,random_state=42)

train_dataset=ReviewDataset(train_df['helpfulness_cat'],train_df['imdb_user_review'])
test_dataset=ReviewDataset(test_df['helpfulness_cat'],test_df['imdb_user_review'])

batch_size=2
# Load in the main process. ReviewDataset already holds pre-tokenized tensors, so
# worker processes have no work to parallelise, and a Dataset class defined inside a
# notebook cannot be reliably re-imported by workers started with the 'spawn' method
# (the default on macOS, where the mps device is used).
num_workers=0
train_loader=DataLoader(dataset=train_dataset,batch_size=batch_size,shuffle=True,num_workers=num_workers)
test_loader=DataLoader(dataset=test_dataset,batch_size=batch_size,shuffle=False,num_workers=num_workers)

## 2.2. Data Balancing (0 & 1)

## 2.3. Converting Text into Vectors

## 2.4. Feature Engineering

# 3. Benchmarking Model (with only text vector input)

## 3.1. Logistic Regression

## 3.2. Selecting the most accurate vector representation

# 4. Other models & Adding features

## 4.1. Bagging, Boosting and Stacking

# 5. Embedding algorithms + LSTM/GRU 

# 6. BERT

In [2]:
import torch
import math

In [3]:
# Choose the compute device and fix the random seeds.
# Prefer CUDA, then Apple's MPS backend, and fall back to the CPU so the
# notebook still runs on machines without an accelerator.
if torch.cuda.is_available():
  device = torch.device("cuda")
elif torch.backends.mps.is_available():
  device = torch.device("mps")
else:
  device = torch.device("cpu")
seed = 42

# Seed every RNG the notebook relies on: Python, NumPy and PyTorch (CPU and CUDA)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# Ask cuDNN for deterministic kernels and disable its autotuner
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so setting it here
# only affects processes launched afterwards, not the running kernel.
os.environ['PYTHONHASHSEED'] = str(seed)
# os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
# torch.cuda.is_available() # use GPU if available (m1 will use the cpu)
print(torch.backends.mps.is_built())
print(torch.backends.mps.is_available())
print(torch.__version__)

True
True
1.13.0.dev20220713


In [1]:
class Bertclassifier(nn.Module):
  """BERT encoder followed by a two-layer classification head with 2 output logits.

  Args:
    dropout: dropout probability applied to the pooled encoder output.
    model_name: pretrained checkpoint passed to BertModel.from_pretrained
      (default 'bert-large-cased', the previously hard-coded value).
  """
  def __init__(self,dropout=0.5,model_name='bert-large-cased'):
    super().__init__()
    self.Bertclassifier=BertModel.from_pretrained(model_name)
    self.dropout=nn.Dropout(dropout)
    # Size the head from the encoder config instead of hard-coding 1024,
    # so other checkpoints work too
    hidden_size=self.Bertclassifier.config.hidden_size
    self.linear1=nn.Linear(hidden_size,256)
    self.linear2=nn.Linear(256,2)
    self.relu=nn.ReLU()

  def forward(self,input_ids,attention_mask):
    """Return the raw (unnormalised) 2-class logits for a batch.

    Args:
      input_ids: token ids produced by the tokenizer.
      attention_mask: matching attention mask.
    """
    # With return_dict=False the encoder returns a tuple; only the second
    # element (the pooled output) is used here
    _,pooled_output=self.Bertclassifier(input_ids=input_ids,attention_mask=attention_mask,return_dict=False)
    # Apply the dropout layer that __init__ creates (it was previously never used)
    pooled_output=self.dropout(pooled_output)
    linear_output=self.relu(self.linear1(pooled_output))
    return self.linear2(linear_output)

model=Bertclassifier().to(device)

NameError: name 'nn' is not defined

In [None]:
# Smoke-test the forward pass on one batch. Fetch the batch once: calling
# next(iter(train_loader)) twice builds two iterators over a shuffled loader, so the
# input ids and the attention mask would come from different samples.
batch=next(iter(train_loader))
model(batch[0].to(device),batch[1].to(device))

In [None]:
class FocalLoss(nn.Module):
    """Focal loss over class logits, built on top of F.nll_loss.

    Each log-probability is scaled by (1 - p) ** gamma before the negative
    log-likelihood is taken, so gamma=0 reduces to ordinary cross-entropy.

    Args:
        weight: optional per-class weights (tensor or array-like, one value
            per class), forwarded to F.nll_loss.
        gamma: exponent of the (1 - p) modulating factor.
        reduction: 'mean', 'sum' or 'none', forwarded to F.nll_loss.
    """

    def __init__(self, weight=None, 
                 gamma=2, reduction='mean'):
        super().__init__()
        if weight is not None:
            # Accept lists / NumPy arrays (e.g. computed class weights) as well as tensors
            weight = torch.as_tensor(weight, dtype=torch.get_default_dtype())
        # Registered as a buffer so .to(device) / .double() carry it along with the
        # module; non-persistent so state_dict() stays exactly as before.
        self.register_buffer('weight', weight, persistent=False)
        self.gamma = gamma
        self.reduction = reduction
        
    def forward(self, input_tensor, target_tensor):
        """Compute the loss.

        Args:
            input_tensor: logits with the classes on the last dimension.
            target_tensor: integer class indices, as required by F.nll_loss.
        """
        log_prob = F.log_softmax(input_tensor, dim=-1)
        prob = torch.exp(log_prob)
        weight = self.weight
        if weight is not None:
            # Match the logits' device and dtype so nll_loss accepts the weights
            weight = weight.to(device=log_prob.device, dtype=log_prob.dtype)
        return F.nll_loss(
            ((1 - prob) ** self.gamma) * log_prob,
            target_tensor, 
            weight=weight,
            reduction = self.reduction
        )

# 7. Comparison and Conclusion

# 8. Submission