# IIC-3800 Tópicos en CC - NLP UC

- Versiones de librerías, python 3.8.10

- numpy 1.20.3
- nltk 3.7
- lime 0.2.0.1
- spacy 3.5.1
- gcsfs 2023.3.0
- protobuf 3.20.3


In [1]:
from nltk.corpus import product_reviews_1
camera_reviews = product_reviews_1.reviews('Canon_G3.txt')

reviews = []

for review in camera_reviews:
    sentences = []
    for sentence in review.sents():
        text = " ".join(sentence)
        sentences.append(text)
    document = " ".join(sentences)
    reviews.append(document)


In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer

# Load stop-words
stop_words = set(stopwords.words('english'))

# Initialize tokenizer
# It's also possible to try with a stemmer or to mix a stemmer and a lemmatizer
tokenizer = RegexpTokenizer('[\'a-zA-Z]+')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def tokenize(document):
    words = []

    for sentence in sent_tokenize(document):
        tokens = [lemmatizer.lemmatize(t.lower()) for t in tokenizer.tokenize(sentence) if t.lower() not in stop_words and len(t) > 2]
        words += tokens
    
    text = ' '.join(words)
    
    return text

In [3]:
corpus = []

for review in reviews:
    document = tokenize(review)
    corpus.append(document)


In [6]:
import nltk

nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\marce\AppData\Roaming\nltk_data...


True

In [7]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Create an instance of SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer() # compound in [-1,1]

Ver documentación de vader en: https://www.nltk.org/api/nltk.sentiment.vader.html

In [5]:
label = []
for i in range(len(corpus)):
    if analyzer.polarity_scores(corpus[i])['compound'] > 0.2: 
        label.append('Positive') # positive sentiment
    elif analyzer.polarity_scores(corpus[i])['compound'] < -0.2:
        label.append('Negative') # negative sentiment
    else:
        label.append('Neutral') # neutral sentiment

In [6]:
import pandas as pd

df = pd.DataFrame(corpus, columns = ['review'])
df['polarities'] = label
df

Unnamed: 0,review,polarities
0,recently purchased canon powershot extremely s...,Positive
1,yep first digital camera toy software engineer...,Positive
2,extensive research comparing different megapix...,Positive
3,bought canon month ago say satisfied taken hun...,Positive
4,camera one full day say wonderful photo qualit...,Positive
5,positive slr like programming exposure control...,Positive
6,camera wonderful set feature lcd screen pull r...,Positive
7,recent price drop made best bargain digital ca...,Positive
8,recommend unreservedly powershot potential buy...,Positive
9,else say camera work make photograph work want...,Positive


# Supervised sentiment analysis (training)

In [7]:
# load in dataset and separate by the __label__ classifier in the text file
data = pd.read_csv('gs://nlp_amazon_data/train.ft.txt', sep="__label__", header = None)

  data = pd.read_csv('gs://nlp_amazon_data/train.ft.txt', sep="__label__", header = None)


In [8]:
data.drop(0, inplace=True, axis=1)
data['sentiment'] = data[1].str[0]
data[1] = data[1].str[2:]
data = data.rename(columns={1: 'review'})
data

Unnamed: 0,review,sentiment
0,Stuning even for the non-gamer: This sound tra...,2
1,The best soundtrack ever to anything.: I'm rea...,2
2,Amazing!: This soundtrack is my favorite music...,2
3,Excellent Soundtrack: I truly like this soundt...,2
4,"Remember, Pull Your Jaw Off The Floor After He...",2
...,...,...
3599995,Don't do it!!: The high chair looks great when...,1
3599996,"Looks nice, low functionality: I have used thi...",1
3599997,"compact, but hard to clean: We have a small ho...",1
3599998,what is it saying?: not sure what this book is...,1


In [9]:
sample = data.sample(n=100000)

!python3 -m spacy download en_core_web_sm

In [10]:
import string
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.blank("en") # Create a blank pipeline of a given language class
REGX_USERNAME = r"@[A-Za-z0-9$-_@.&+]+"

def preprocessing(text):
  text = text.lower()
  text = re.sub(REGX_USERNAME, ' ', text)
  tokens = [token.text for token in nlp(text)]
  tokens = [t for t in tokens if t not in STOP_WORDS and t not in string.punctuation and len(t) > 2]
  tokens = [t for t in tokens if not t.isdigit()]

  return " ".join(tokens)

sample["text_clean"] = sample["review"].apply(preprocessing)
sample.head()

Unnamed: 0,review,sentiment,text_clean
1669564,Awesome!: These are Great! Bought these for my...,2,awesome great bought daughter boyfriend loves ...
3314648,Shame on you Lewin!: Corrupting the integrity ...,1,shame lewin corrupting integrity amazon review...
1226503,ultimate christmas c d volumn 3: here we go ag...,2,ultimate christmas volumn great christmas favo...
709998,Stand has a major defect: I bought this monito...,1,stand major defect bought monitor months ago s...
360808,"Track After Track, On Repeat, Delicious to the...",2,track track repeat delicious ears feet lies li...


In [11]:
sample

Unnamed: 0,review,sentiment,text_clean
1669564,Awesome!: These are Great! Bought these for my...,2,awesome great bought daughter boyfriend loves ...
3314648,Shame on you Lewin!: Corrupting the integrity ...,1,shame lewin corrupting integrity amazon review...
1226503,ultimate christmas c d volumn 3: here we go ag...,2,ultimate christmas volumn great christmas favo...
709998,Stand has a major defect: I bought this monito...,1,stand major defect bought monitor months ago s...
360808,"Track After Track, On Repeat, Delicious to the...",2,track track repeat delicious ears feet lies li...
...,...,...,...
1765260,Not Recommended: Many of the recipes in this b...,1,recommended recipes book include ingredients b...
2508426,Their Greatest!: This is George Michael/Wham's...,2,greatest george michael wham greatest album tr...
3098618,Cadburys Flake Case of 24: Very good product. ...,2,cadburys flake case good product family loves ...
1203438,Can server the purpose: I bought two of these ...,1,server purpose bought items small work suggest...


In [12]:
dataset = list(sample[["text_clean", "sentiment"]].sample(frac=1).itertuples(index=False, name=None))
train_data = dataset[:75000]  # 75%
dev_data = dataset[75000:90000] # 15%
test_data = dataset[90000:] # 10%

In [13]:
def convert(data, outfile):
    db = spacy.tokens.DocBin()
    docs = []
    for doc, label in nlp.pipe(data, as_tuples=True):
        doc.cats["POS"] = label == '2'
        doc.cats["NEG"] = label == '1'
        db.add(doc)
    
    db.to_disk(outfile)
convert(train_data, "./train.spacy")
convert(dev_data, "./dev.spacy")
convert(test_data, "./test.spacy")

In [14]:
!python3 -m spacy init config --lang en --pipeline textcat --optimize efficiency --force config.cfg

2023-03-16 14:29:55.174794: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-03-16 14:29:55.690500: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-16 14:29:55.690563: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2023-03-16 14:29:56.644307: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2023-03-16 14:29:56.644367: W tensorflow/compiler/xla/stream_execut

Ver documentación de config en: https://spacy.io/usage/training#quickstart

Ver documentación de architectures en: https://spacy.io/api/architectures

In [15]:
!python3 -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy --output model --verbose

2023-03-16 14:29:59.199420: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-03-16 14:29:59.711462: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-16 14:29:59.711518: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2023-03-16 14:30:00.632075: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2023-03-16 14:30:00.632133: W tensorflow/compiler/xla/stream_execut

In [16]:
!python3 -m spacy evaluate ./model/model-best/ ./test.spacy

2023-03-16 14:34:21.918688: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-03-16 14:34:22.434323: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-16 14:34:22.434376: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2023-03-16 14:34:23.339304: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2023-03-16 14:34:23.339359: W tensorflow/compiler/xla/stream_execut

In [17]:
texts = ["This movie is unnecessarily long. At times it gets boring and hard to follow.", "I regretted ever purchasing or making order on this platform."]
nlp = spacy.load("./model/model-best")
for text in texts:
    doc = nlp(preprocessing(text))
    print(doc.cats,  "-",  text)

{'POS': 0.08944917470216751, 'NEG': 0.9105508327484131} - This movie is unnecessarily long. At times it gets boring and hard to follow.
{'POS': 0.3397209644317627, 'NEG': 0.6602790355682373} - I regretted ever purchasing or making order on this platform.
