<a href="https://colab.research.google.com/github/yulucy19/Thinkful-Project-2019/blob/master/Text_preprocessing_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from collections import Counter
import nltk
import spacy
import re
from sqlalchemy import create_engine
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings(action="ignore")

In [2]:
!python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [0]:
postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'cornell_movie_dialogs'

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

dialogs_df = pd.read_sql_query('select * from dialogs',con=engine)

# no need for an open connection, as we're only doing a single query
engine.dispose()

In [4]:
dialogs_df.head(10)

Unnamed: 0,index,dialogs
0,0,Can we make this quick? Roxanne Korrine and A...
1,1,"Well, I thought we'd start with pronunciation,..."
2,2,Not the hacking and gagging and spitting part....
3,3,Okay... then how 'bout we try out some French ...
4,4,You're asking me out. That's so cute. What's ...
5,5,Forget it.
6,6,"No, no, it's my fault -- we didn't have a prop..."
7,7,Cameron.
8,8,"The thing is, Cameron -- I'm at the mercy of a..."
9,9,Seems like she could get a date easy enough...


In [5]:
dialogs_df.shape

(304446, 2)

In [6]:
dialogs_df['dialogs'].head(2)

0    Can we make this quick?  Roxanne Korrine and A...
1    Well, I thought we'd start with pronunciation,...
Name: dialogs, dtype: object

### Preprocessing

In [0]:
# Make text lowercase. remove punctuation
dialogs_df['dialogs'] = dialogs_df['dialogs'].str.lower()

### Tokenization

In [0]:
nlp = spacy.load('en', disable=['parser', 'ner'])
nlp.add_pipe(nlp.create_pipe('sentencizer'))

# below is necessary to avoid memory error of SpaCy
nlp.max_length = 20000000

# all the processing work is done below, so it may take a while
dialogs_doc = nlp(" ".join(dialogs_df.dialogs))

In [9]:
# let's explore the objects we've built.
print("The dialogs_doc object is a {} object.".format(type(dialogs_doc)))
print("It is {} tokens long".format(len(dialogs_doc)))
print("The first three tokens are '{}'".format(dialogs_doc[:3]))
print("The type of each token is {}".format(type(dialogs_doc[0])))

The dialogs_doc object is a <class 'spacy.tokens.doc.Doc'> object.
It is 4279540 tokens long
The first three tokens are 'can we make'
The type of each token is <class 'spacy.tokens.token.Token'>


### Removing Stopwords

In [0]:
dialogs_without_stopwords = [token for token in dialogs_doc if not token.is_stop]

In [11]:
# utility function to calculate how frequently words appear in the text
def word_frequencies(text):
    
    # build a list of words
    # strip out punctuation
    words = []
    for token in text:
        if not token.is_punct:
            words.append(token.text)
            
    # build and return a Counter object containing word counts
    return Counter(words)

# instantiate our list of most common words.
dialogs_word_freq = word_frequencies(dialogs_without_stopwords).most_common(10)
print('\ndialogs:', dialogs_word_freq)



dialogs: [(' ', 83751), ('know', 21613), ('like', 15026), ('got', 13284), ('want', 11042), ('think', 10761), ('right', 10031), ('going', 8869), ('oh', 7918), ('good', 7465)]


### Lemmatization

In [14]:
# utility function to calculate how frequently lemas appear in the text
def lemma_frequencies(text):
    
    # build a list of lemas
    # strip out punctuation
    lemmas = []
    for token in text:
        if not token.is_punct:
            lemmas.append(token.lemma_)
            
    # build and return a Counter object containing lemma counts
    return Counter(lemmas)

# instantiate our list of most common words.
dialogs_lemma_freq = lemma_frequencies(dialogs_without_stopwords).most_common(10)
print('\ndialogs:', dialogs_lemma_freq)


dialogs: [(' ', 83751), ('know', 24766), ('go', 16895), ('get', 15820), ('like', 15620), ('think', 15068), ('want', 13983), ('come', 10616), ('tell', 10493), ('right', 10131)]
