# Intro 2 Text Mining

In [18]:
# Importing the libraries
import numpy as np
import re
import pickle 
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import load_files
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/uzaycetin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Unpickling Movie Review dataset

In [19]:
# Unpickling dataset
X_in = open('uX.pickle','rb')
y_in = open('uy.pickle','rb')
X = pickle.load(X_in)
y = pickle.load(y_in)

In [20]:
type(X), len(X)

(list, 2000)

In [22]:
X[10]

b'it\'s ironic that the best films in cinema history are invariably the original director\'s cut of the film . \nfilms such as aliens , the abyss , the wild bunch , blade runner , and terminator 2 are all prime examples of a filmmaker\'s integrity , later chopped up or mucked with by the studio . \nthe advent of the dvd format has provided a more accessible way to get these original cuts to the public and provide to film freaks like myself the ability to become further enraptured by the extension of such classic films . \nthe dvd release of the original international version of luc besson\'s 1995 masterpiece the professional , which is known as l ? on around the world , is a prime example of how a good film can become an instant classic as a director\'s cut . \nfor years , i have heard of an " international " version available only in laserdisc format , which has eluded me for years . \ni even bought a laserdisc player from my uncle don for 100 bucks just to watch certain directors\' c

In [24]:
y[10]

1

In [25]:
# Creating the corpus
corpus = []
for i in range(0, 2000):
    review = re.sub(r'\W', ' ', str(X[i]))
    review = review.lower()
    review = re.sub(r'^br$', ' ', review)
    review = re.sub(r'\s+br\s+',' ',review)
    review = re.sub(r'\s+[a-z]\s+', ' ',review)
    review = re.sub(r'^b\s+', '', review)
    review = re.sub(r'\s+', ' ', review)
    corpus.append(review)  

In [38]:
corpus[5]

'don know how many other people have had the idea cross their mind that their life could be an ongoing television show watched by another world of people but it something used to wonder about when was younger ni can decide if first thought it because watched lot of tv or because my brother hit me in the head with baseball bat but m pretty sure andrew niccol screenwriter for the truman show has had the same curious thought nthe truman show is about man jim carrey whose entire life has been engineered by corporation and marketed to the public nsince birth he been living in the fictional island town of seahaven fla which actually exists as giant domed set just beyond the hollywood sign nall the people in truman burbank life are actors and the anonymous townfolk paid extras npeople watch truman life 24 hours day live with no commercial interruptions nrevenue comes instead from product placement staple of contemporary hollywood with truman friends and relatives describing their consumer ite

# Bag of Words 

In [26]:
# Creating the BOW model
#
# min_df = 3      a word should pass at least 3 doc
# max_df = 0.6    words that pass in more than %60 of words is also eliminated

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features = 2000, min_df = 3, max_df = 0.6, stop_words = stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray()

In [27]:
X.shape

(2000, 2000)

In [29]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [2, 0, 1, ..., 0, 0, 0]], dtype=int64)

In [34]:
#vectorizer.vocabulary_

In [8]:
import pandas as pd
words = vectorizer.vocabulary_
df = pd.DataFrame(data = list(words.keys()), index = list(words.values()) )
df.columns = ['ID']
df.head()

Unnamed: 0,ID
103,arnold
24,action
1602,since
946,late
632,films


In [36]:
max(vectorizer.vocabulary_.values())

1999

# TF-IDF Model

In [39]:
# Creating the Tf-Idf model directly
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features = 2000, min_df = 3, max_df = 0.6, stop_words = stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray()

In [40]:
X.shape

(2000, 2000)

In [43]:
X[0,:10]

array([0.        , 0.        , 0.        , 0.        , 0.06635601,
       0.        , 0.        , 0.        , 0.        , 0.        ])

# Train Test Split

In [44]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
text_train, text_test, sent_train, sent_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [45]:
text_train.shape

(1600, 2000)

# Training the Classifier

In [46]:
# Training the classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(text_train,sent_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Evaluation

In [48]:
# Testing model performance
sent_pred = classifier.predict(text_test)


from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(sent_test, sent_pred)

In [49]:
cm

array([[168,  40],
       [ 21, 171]])

In [50]:
accuracy_score(sent_test, sent_pred)

0.8475

# Save Classifier Model

In [51]:
# Saving our classifier
with open('myclassifier.pickle','wb') as f:
    pickle.dump(classifier,f)
    
# Saving the Tf-Idf model
with open('mytfidfmodel.pickle','wb') as f:
    pickle.dump(vectorizer,f)