In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('imdb_master.csv', encoding='latin-1')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [4]:
df_test = df[df["type"]=="test"]

In [5]:
df_test.shape

(25000, 5)

In [6]:
df_train = df[df["type"]=="train"] 

In [7]:
df_train.shape

(75000, 5)

In [8]:
X_train = df_train[["review"]].values
Y_train = df_train[["label"]].values

In [9]:
X_train.shape

(75000, 1)

In [10]:
X_test = df_test[["review"]].values
Y_test = df_test[["label"]].values

In [11]:
X_test.shape

(25000, 1)

# feature extraction

In [12]:
from nltk import TreebankWordTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [13]:
tb_tokenizer = TreebankWordTokenizer()

In [14]:
X_train[0][0]

"Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly."

In [15]:
def tokenizer(txt):
    tb_tokenizer = TreebankWordTokenizer()
    return tb_tokenizer.tokenize(txt)

In [16]:
def stemming(tokens):
    porter_stem = PorterStemmer()
    return " ".join(porter_stem.stem(token) for token in tokens)

In [17]:
def lemmatization(tokens):
    lem = WordNetLemmatizer()
    return " ".join(lem.lemmatize(token) for token in tokens)

In [18]:
def text_preprocess(txt, lemma=False):
    tokens = tokenizer(txt)
    if not lemma:
        return stemming(tokens)
    else:
        return lemmatization(tokens)

In [19]:
txt = "Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly."

In [20]:
text_preprocess(txt)

"stori of a man who ha unnatur feel for a pig. start out with a open scene that is a terrif exampl of absurd comedy. A formal orchestra audienc is turn into an insan , violent mob by the crazi chant of it 's singers. unfortun it stay absurd the whole time with no gener narr eventu make it just too off putting. even those from the era should be turn off. the cryptic dialogu would make shakespear seem easi to a third grader. On a technic level it 's better than you might think with some good cinematographi by futur great vilmo zsigmond. futur star salli kirkland and freder forrest can be seen briefli ."

In [21]:
text_preprocess(txt, True)

"Story of a man who ha unnatural feeling for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane , violent mob by the crazy chanting of it 's singers. Unfortunately it stay absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it 's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future star Sally Kirkland and Frederic Forrest can be seen briefly ."

In [22]:
m = X_train.shape[0]

In [23]:
for i in range(m):
    txt = X_train[i][0] 
    X_train[i][0] = text_preprocess(txt)

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [25]:
tfidf = TfidfVectorizer(max_df=.5, min_df=1, ngram_range=(1,2))

In [26]:
A = np.asarray(tfidf.fit_transform(X_train.ravel()).todense())

In [32]:
A[23476].mean(axi)

4.799198059916133e-06

In [28]:
X_train

array([["stori of a man who ha unnatur feel for a pig. start out with a open scene that is a terrif exampl of absurd comedy. A formal orchestra audienc is turn into an insan , violent mob by the crazi chant of it 's singers. unfortun it stay absurd the whole time with no gener narr eventu make it just too off putting. even those from the era should be turn off. the cryptic dialogu would make shakespear seem easi to a third grader. On a technic level it 's better than you might think with some good cinematographi by futur great vilmo zsigmond. futur star salli kirkland and freder forrest can be seen briefli ."],
       ["airport '77 start as a brand new luxuri 747 plane is load up with valuabl paint & such belong to rich businessman philip steven ( jame stewart ) who is fli them & a bunch of vip 's to hi estat in prepar of it be open to the public as a museum , also on board is steven daughter juli ( kathleen quinlan ) & her son. the luxuri jetlin take off as plan but mid-air the plane 