# Natural Language Processing

In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vicke\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vicke\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vicke\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [2]:
kittycat = 'We are all agreeing with the cats on this one, and she is too!'

### Basic Cleanup

In [3]:
import re
def clean_up(text):
    import re
    text = re.sub('[^A-Za-z0-9 ]','',text)
    text = text.lower().strip()
    return text
kittycat_clean = clean_up(kittycat)
print(kittycat_clean)

we are all agreeing with the cats on this one and she is too


### Tokenization

In [4]:
from nltk.tokenize import word_tokenize
kittycat_tokenize = word_tokenize(kittycat_clean)
print(kittycat_tokenize)

['we', 'are', 'all', 'agreeing', 'with', 'the', 'cats', 'on', 'this', 'one', 'and', 'she', 'is', 'too']


### Lemmatization

In [5]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [6]:
kittycat_lemmatize = [lemmatizer.lemmatize(item) for item in kittycat_tokenize]
print(kittycat_lemmatize)

['we', 'are', 'all', 'agreeing', 'with', 'the', 'cat', 'on', 'this', 'one', 'and', 'she', 'is', 'too']


### Stemming

In [7]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")

In [8]:
kittycat_stem = [stemmer.stem(item) for item in kittycat_lemmatize]
print(kittycat_stem)

['we', 'are', 'all', 'agre', 'with', 'the', 'cat', 'on', 'this', 'one', 'and', 'she', 'is', 'too']


### Removing Stopwords

In [9]:
from nltk.corpus import stopwords

stopwords_list = stopwords.words("english")
kittycat_nostopwords = [item for item in kittycat_stem if not item in stopwords_list]
print(kittycat_nostopwords)

['agre', 'cat', 'one']


### Vectorizing Text

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
help(CountVectorizer)

Help on class CountVectorizer in module sklearn.feature_extraction.text:

class CountVectorizer(_VectorizerMixin, sklearn.base.BaseEstimator)
 |  CountVectorizer(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), analyzer='word', max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.int64'>)
 |  
 |  Convert a collection of text documents to a matrix of token counts
 |  
 |  This implementation produces a sparse representation of the counts using
 |  scipy.sparse.csr_matrix.
 |  
 |  If you do not provide an a-priori dictionary and you do not use an analyzer
 |  that does some kind of feature selection then the number of features will
 |  be equal to the vocabulary size found by analyzing the data.
 |  
 |  Read more in the :ref:`User Guide <text_feature_extraction>`.
 |  
 |  Parameters
 |  ------

In [11]:
vectorizer.fit_transform(kittycat_nostopwords).toarray()

array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1]], dtype=int64)

## Applying to real data: IMDB movie reviews

Get the data from here: http://ai.stanford.edu/~amaas/data/sentiment/

An example walkthrough: https://dropsofai.com/sentiment-analysis-with-python-bag-of-words/

In [12]:
from pathlib import Path

# reading positive reviews
txt_folder = Path('aclImdb/train/pos').rglob('*.txt')
files = [x for x in txt_folder]
content = []
for name in files:
    f = open(name, 'r')  
    content.append(f.readlines()[0])
    f.close()
pos = pd.DataFrame(content)

In [13]:
# reading negative reviews
txt_folder = Path('aclImdb/train/neg').rglob('*.txt')
files = [x for x in txt_folder]
content = []
for name in files:
    f = open(name, 'r')  
    content.append(f.readlines()[0])
    f.close()
neg = pd.DataFrame(content)

In [14]:
# we will try to predict whether a review is positive
pos['target'] = 1
neg['target'] = 0

In [15]:
# putting both dataframes together
df = pd.concat([pos, neg], axis = 0)
df.rename(columns = {0:'review'}, inplace = True)

In [16]:
# the dataset is very large, so we are only taking a subset for analysis
df = df.sample(frac=0.25)

## Preparing the data

In [17]:
df['target'].value_counts()

Series([], Name: target, dtype: int64)

In [18]:
df['review_clean'] = df['review'].apply(clean_up)
df.head()

KeyError: 'review'

## Tokenization

In [None]:
from nltk.tokenize import word_tokenize
df['review_tokenize'] = df['review_clean'].apply(word_tokenize)
df.head()

## Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

df['review_lemmatize'] = df['review_tokenize'].apply(lambda row: [lemmatizer.lemmatize(item) for item in row])
df.head()

## Stemming

In [None]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")

df['review_stem'] = df['review_lemmatize'].apply(lambda row: [stemmer.stem(item) for item in row])
df.head()

## Removing Stopwords

In [None]:
from nltk.corpus import stopwords
stopwords_list = stopwords.words("english")

df['review_nostopwords'] = df['review_stem'].apply(lambda row: [item for item in row if not item in stopwords_list])
df.head()

## Vectorizing Text

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer=lambda x: x)

In [None]:
X = vectorizer.fit_transform(df['review_nostopwords']).toarray()

## Splitting into train and test set

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, df['target'], test_size=0.33, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [None]:
pred = clf.predict(X_test)

In [None]:
clf.score(X_test, y_test)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

print(precision_score(y_test, pred))
print(recall_score(y_test, pred))
print(f1_score(y_test, pred))

In [None]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, pred)