# IIC-3800 Tópicos en CC - NLP UC

- Versiones de librerías, python 3.8.10

- numpy 1.20.3
- nltk 3.7
- lime 0.2.0.1
- spacy 3.5.1
- gcsfs 2023.3.0
- protobuf 3.20.3


In [1]:
from nltk.corpus import product_reviews_1
camera_reviews = product_reviews_1.reviews('Canon_G3.txt')

reviews = []

for review in camera_reviews:
    sentences = []
    for sentence in review.sents():
        text = " ".join(sentence)
        sentences.append(text)
    document = " ".join(sentences)
    reviews.append(document)


In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer

# Load stop-words
stop_words = set(stopwords.words('english'))

# Initialize tokenizer
# It's also possible to try with a stemmer or to mix a stemmer and a lemmatizer
tokenizer = RegexpTokenizer('[\'a-zA-Z]+')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def tokenize(document):
    words = []

    for sentence in sent_tokenize(document):
        tokens = [lemmatizer.lemmatize(t.lower()) for t in tokenizer.tokenize(sentence) if t.lower() not in stop_words and len(t) > 2]
        words += tokens
    
    text = ' '.join(words)
    
    return text

In [3]:
corpus = []

for review in reviews:
    document = tokenize(review)
    corpus.append(document)


In [4]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Create an instance of SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer() # compound in [-1,1]

Ver documentación de vader en: https://www.nltk.org/api/nltk.sentiment.vader.html

In [5]:
label = []
for i in range(len(corpus)):
    if analyzer.polarity_scores(corpus[i])['compound'] > 0.2: 
        label.append('Positive') # positive sentiment
    elif analyzer.polarity_scores(corpus[i])['compound'] < -0.2:
        label.append('Negative') # negative sentiment
    else:
        label.append('Neutral') # neutral sentiment

In [6]:
import pandas as pd

df = pd.DataFrame(corpus, columns = ['review'])
df['polarities'] = label
df

Unnamed: 0,review,polarities
0,recently purchased canon powershot extremely s...,Positive
1,yep first digital camera toy software engineer...,Positive
2,extensive research comparing different megapix...,Positive
3,bought canon month ago say satisfied taken hun...,Positive
4,camera one full day say wonderful photo qualit...,Positive
5,positive slr like programming exposure control...,Positive
6,camera wonderful set feature lcd screen pull r...,Positive
7,recent price drop made best bargain digital ca...,Positive
8,recommend unreservedly powershot potential buy...,Positive
9,else say camera work make photograph work want...,Positive


# Supervised sentiment analysis (training)

In [7]:
# load in dataset and separate by the __label__ classifier in the text file
data = pd.read_csv('gs://nlp_amazon_data/train.ft.txt', sep="__label__", header = None)

  data = pd.read_csv('gs://nlp_amazon_data/train.ft.txt', sep="__label__", header = None)


In [8]:
data.drop(0, inplace=True, axis=1)
data['sentiment'] = data[1].str[0]
data[1] = data[1].str[2:]
data = data.rename(columns={1: 'review'})
data

Unnamed: 0,review,sentiment
0,Stuning even for the non-gamer: This sound tra...,2
1,The best soundtrack ever to anything.: I'm rea...,2
2,Amazing!: This soundtrack is my favorite music...,2
3,Excellent Soundtrack: I truly like this soundt...,2
4,"Remember, Pull Your Jaw Off The Floor After He...",2
...,...,...
3599995,Don't do it!!: The high chair looks great when...,1
3599996,"Looks nice, low functionality: I have used thi...",1
3599997,"compact, but hard to clean: We have a small ho...",1
3599998,what is it saying?: not sure what this book is...,1


In [9]:
sample = data.sample(n=100000)

In [10]:
import string
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.blank("en") # Create a blank pipeline of a given language class
REGX_USERNAME = r"@[A-Za-z0-9$-_@.&+]+"

def preprocessing(text):
  text = text.lower()
  text = re.sub(REGX_USERNAME, ' ', text)
  tokens = [token.text for token in nlp(text)]
  tokens = [t for t in tokens if t not in STOP_WORDS and t not in string.punctuation and len(t) > 2]
  tokens = [t for t in tokens if not t.isdigit()]

  return " ".join(tokens)

sample["text_clean"] = sample["review"].apply(preprocessing)
sample.head()

Unnamed: 0,review,sentiment,text_clean
3346284,"Great cookbook.: I recently lost my husband, b...",2,great cookbook recently lost husband loved coo...
2836622,perfect for occasional use on shower glass & w...,2,perfect occasional use shower glass windshield...
36930,a good buy for this price: I bought this car s...,2,good buy price bought car seat backup car good...
2526701,"As always, Céline and her incredible talent!: ...",2,céline incredible talent céline english record...
261667,"flawed: Generally a very good resource, we sta...",1,flawed generally good resource stayed small lo...


In [11]:
sample

Unnamed: 0,review,sentiment,text_clean
3346284,"Great cookbook.: I recently lost my husband, b...",2,great cookbook recently lost husband loved coo...
2836622,perfect for occasional use on shower glass & w...,2,perfect occasional use shower glass windshield...
36930,a good buy for this price: I bought this car s...,2,good buy price bought car seat backup car good...
2526701,"As always, Céline and her incredible talent!: ...",2,céline incredible talent céline english record...
261667,"flawed: Generally a very good resource, we sta...",1,flawed generally good resource stayed small lo...
...,...,...,...
2172193,Exactly what I wanted: The bumper caps fit as ...,2,exactly wanted bumper caps fit expected better...
346447,"Eloquent discriptions, Savage vision, Fantasti...",2,eloquent discriptions savage vision fantastic ...
515569,looks good to me: I hope I never have to use i...,2,looks good hope use looks like quality piece h...
2101032,Disappointed: I am disappointed with this blow...,1,disappointed disappointed blower purchased blo...


In [12]:
dataset = list(sample[["text_clean", "sentiment"]].sample(frac=1).itertuples(index=False, name=None))
train_data = dataset[:75000]  # 75%
dev_data = dataset[75000:90000] # 15%
test_data = dataset[90000:] # 10%

In [13]:
def convert(data, outfile):
    db = spacy.tokens.DocBin()
    docs = []
    for doc, label in nlp.pipe(data, as_tuples=True):
        doc.cats["POS"] = label == '2'
        doc.cats["NEG"] = label == '1'
        db.add(doc)
    
    db.to_disk(outfile)
convert(train_data, "./train.spacy")
convert(dev_data, "./dev.spacy")
convert(test_data, "./test.spacy")

In [14]:
!python3 -m spacy init config --lang en --pipeline textcat --optimize efficiency --force config.cfg

2023-03-15 12:21:09.684878: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-03-15 12:21:10.200898: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-15 12:21:10.200957: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2023-03-15 12:21:11.160686: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2023-03-15 12:21:11.160748: W tensorflow/compiler/xla/stream_execut

Ver documentación de config en: https://spacy.io/usage/training#quickstart

Ver documentación de architectures en: https://spacy.io/api/architectures

In [15]:
!python3 -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy --output model --verbose

2023-03-15 12:21:13.718929: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-03-15 12:21:14.212680: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-15 12:21:14.212745: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2023-03-15 12:21:15.082407: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2023-03-15 12:21:15.082472: W tensorflow/compiler/xla/stream_execut

In [16]:
!python3 -m spacy evaluate ./model/model-best/ ./test.spacy

2023-03-15 12:26:19.385040: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-03-15 12:26:19.867377: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-15 12:26:19.867427: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2023-03-15 12:26:20.734847: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2023-03-15 12:26:20.734905: W tensorflow/compiler/xla/stream_execut

In [17]:
texts = ["This movie is unnecessarily long. At times it gets boring and hard to follow.", "I regretted ever purchasing or making order on this platform."]
nlp = spacy.load("./model/model-best")
for text in texts:
    doc = nlp(preprocessing(text))
    print(doc.cats,  "-",  text)

{'POS': 0.1095241829752922, 'NEG': 0.8904758095741272} - This movie is unnecessarily long. At times it gets boring and hard to follow.
{'POS': 0.44296127557754517, 'NEG': 0.5570387244224548} - I regretted ever purchasing or making order on this platform.
