## 1. Import libraries

In [1]:
import os
import re
from collections import Counter
from urllib.parse import quote

import nltk
import pandas as pd
import spacy
from sqlalchemy import create_engine
!python -m spacy download en
import numpy as np

Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
[!] As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the full
pipeline package name 'en_core_web_sm' instead.
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


## 2. Data pre-processing for Cornell Movie--Dialogs Corpus data

In [2]:
# load movie data
# Connection settings can be overridden through DSBC_POSTGRES_* environment
# variables; each one falls back to the value that was previously hardcoded
# here, so the notebook still runs unchanged.
# NOTE(review): the password default should be removed from the notebook once
# DSBC_POSTGRES_PW is supplied by the environment.
postgres_user = os.environ.get('DSBC_POSTGRES_USER', 'dsbc_student')
postgres_pw = os.environ.get('DSBC_POSTGRES_PW', '7*.8G9QH21')
postgres_host = os.environ.get('DSBC_POSTGRES_HOST', '142.93.121.174')
postgres_port = os.environ.get('DSBC_POSTGRES_PORT', '5432')
postgres_db = 'cornell_movie_dialogs'

# Percent-encode user and password so characters such as '@', ':' or '/'
# cannot corrupt the connection URL.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    quote(postgres_user, safe=''), quote(postgres_pw, safe=''),
    postgres_host, postgres_port, postgres_db))

# Dispose of the engine even when the query raises.
try:
    movie_df = pd.read_sql_query('select * from dialogs', con=engine)
finally:
    engine.dispose()

Also available at: http://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html

In [7]:
# Preview the first rows of the loaded dialogs table (head() shows five by default).
movie_df.head()

Unnamed: 0,index,dialogs
0,0,Can we make this quick? Roxanne Korrine and A...
1,1,"Well, I thought we'd start with pronunciation,..."
2,2,Not the hacking and gagging and spitting part....
3,3,Okay... then how 'bout we try out some French ...
4,4,You're asking me out. That's so cute. What's ...


In [4]:
# (rows, columns) of the full dialogs table.
movie_df.shape

(304446, 2)

In [11]:
# The full dataset was too large to process with the available machine
# resources, so the rest of this section works on a 40% random sample
# (drawn without replacement, with a fixed seed so the sample is repeatable).
sample_movie = movie_df.sample(
    frac=0.4,
    replace=False,
    random_state=44,
)

In [12]:
# Preview the first rows of the sampled dialogs (head() shows five by default).
sample_movie.head()

Unnamed: 0,index,dialogs
179597,179597,"Is that a trick question? C'mon, I can't be t..."
131002,131002,"Maybe, but someone's got to make a stand."
225173,225171,And the hat. But she is a witch.
48779,48778,All I'm saying is they've got people who handl...
205854,205854,We're going to have to cover the entire lagoon.


In [13]:
# (rows, columns) of the sampled dialogs table.
sample_movie.shape

(121778, 2)

In [14]:
# The steps below only use stop-word flags and lemmas, so the parser and NER
# components are disabled when the pipeline is loaded.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Raise spaCy's per-text length limit so the joined dialogs can be processed as one text.
nlp.max_length = 20000000

# Process every sampled dialog as a single space-joined text.
dialogs_doc = nlp(' '.join(sample_movie.dialogs))

In [15]:
# Quick look at what the pipeline produced: container type, token count,
# the leading tokens, and the type of an individual token.
doc_facts = [
    ("The dialogs_doc object is a {} object.", type(dialogs_doc)),
    ("It is {} tokens long", len(dialogs_doc)),
    ("The first three tokens are '{}'", dialogs_doc[:3]),
    ("The type of each token is {}", type(dialogs_doc[0])),
]
for template, value in doc_facts:
    print(template.format(value))

The dialogs_doc object is a <class 'spacy.tokens.doc.Doc'> object.
It is 1702001 tokens long
The first three tokens are 'Is that a'
The type of each token is <class 'spacy.tokens.token.Token'>


In [16]:
# Keep only the tokens of the movie dialogs that spaCy does not flag as stop words.
dialogs_without_stopwords = [
    tok
    for tok in dialogs_doc
    if not tok.is_stop
]

In [17]:
# lemmatization of the movie dataset
# The generic name `lemmas` is reassigned later by the Twitter section, so the
# movie lemmas are also kept under a corpus-specific name.
dialog_lemmas = [token.lemma_ for token in dialogs_without_stopwords]
lemmas = dialog_lemmas  # original name, kept so existing references still work

## 3. Data pre-processing for Twitter US Airline Sentiment data

In [18]:
# load twitter dataset
# Connection settings can be overridden through DSBC_POSTGRES_* environment
# variables; each one falls back to the value that was previously hardcoded
# here, so the notebook still runs unchanged.
# NOTE(review): the password default should be removed from the notebook once
# DSBC_POSTGRES_PW is supplied by the environment.
postgres_user = os.environ.get('DSBC_POSTGRES_USER', 'dsbc_student')
postgres_pw = os.environ.get('DSBC_POSTGRES_PW', '7*.8G9QH21')
postgres_host = os.environ.get('DSBC_POSTGRES_HOST', '142.93.121.174')
postgres_port = os.environ.get('DSBC_POSTGRES_PORT', '5432')
postgres_db = 'twitter_sentiment'

# Percent-encode user and password so characters such as '@', ':' or '/'
# cannot corrupt the connection URL.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    quote(postgres_user, safe=''), quote(postgres_pw, safe=''),
    postgres_host, postgres_port, postgres_db))

# Dispose of the engine even when the query raises.
try:
    twitter_df = pd.read_sql_query('select * from twitter', con=engine)
finally:
    engine.dispose()

Also available at: https://www.kaggle.com/crowdflower/twitter-airline-sentiment

In [3]:
# Preview the first rows of the loaded tweets table (head() shows five by default).
twitter_df.head()

Unnamed: 0,index,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [19]:
# (rows, columns) of the tweets table.
twitter_df.shape

(14640, 16)

In [20]:
# Load the small English pipeline with the parser and NER components disabled.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'ner'],
)

# Raise spaCy's per-text length limit (original note: avoids spaCy's memory
# error on the long joined text).
nlp.max_length = 20000000

# Run the pipeline over all tweets joined into one space-separated text.
twitter_doc = nlp(' '.join(twitter_df['text']))

In [21]:
# Explore the objects of twitter_doc: container type, token count, leading
# tokens, and the type of an individual token.
# The first message previously named 'alice_doc', which is not the variable
# being described here; it now names twitter_doc.
print('The twitter_doc object is a {} object.'.format(type(twitter_doc)))
print('It is {} tokens long.'.format(len(twitter_doc)))
print('The first three tokens are {}'.format(twitter_doc[:3])) 
print('The type of each token is {}'.format(type(twitter_doc[0])))

The alice_doc object is a <class 'spacy.tokens.doc.Doc'> object.
It is 307359 tokens long.
The first three tokens are @VirginAmerica What @dhepburn
The type of each token is <class 'spacy.tokens.token.Token'>


In [22]:
# Keep only the tokens of the tweets that spaCy does not flag as stop words.
tweets_without_stopwords = [
    tok
    for tok in twitter_doc
    if not tok.is_stop
]

In [23]:
# lemmatization of tweets without stopwords
# `lemmas` was already used for the movie-dialog lemmas earlier in the
# notebook; the tweet lemmas are also kept under a corpus-specific name so the
# two corpora cannot be confused.
tweet_lemmas = [token.lemma_ for token in tweets_without_stopwords]
lemmas = tweet_lemmas  # original name, kept so existing references still work