# Working with text data
## Pandas string functions

In [317]:
import pandas as pd
import numpy as np

s = pd.Series(['0', 'John Wood', 'Colin Welsh', 'my list', '02456', np.nan, 'HELLO WORLD', 'water%'])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/valentinmonney/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# lowercase
s.str.lower()

0              0
1      john wood
2    colin welsh
3        my list
4          02456
5            NaN
6    hello world
7         water%
dtype: object

In [4]:
s.str.upper()

0              0
1      JOHN WOOD
2    COLIN WELSH
3        MY LIST
4          02456
5            NaN
6    HELLO WORLD
7         WATER%
dtype: object

In [5]:
s.str.len()

0     1.0
1     9.0
2    11.0
3     7.0
4     5.0
5     NaN
6    11.0
7     6.0
dtype: float64

In [6]:
# split when we have a specific character
s.str.split(' ')

0               [0]
1      [John, Wood]
2    [Colin, Welsh]
3        [my, list]
4           [02456]
5               NaN
6    [HELLO, WORLD]
7          [water%]
dtype: object

In [7]:
# return a dataframe instead of a list
substrings = s.str.split(' ', expand=True)
substrings

Unnamed: 0,0,1
0,0,
1,John,Wood
2,Colin,Welsh
3,my,list
4,02456,
5,,
6,HELLO,WORLD
7,water%,


In [8]:
substrings[1]

0     None
1     Wood
2    Welsh
3     list
4     None
5      NaN
6    WORLD
7     None
Name: 1, dtype: object

In [9]:
#replace a character
s.str.replace('%', ' percent ')

0                 0
1         John Wood
2       Colin Welsh
3           my list
4             02456
5               NaN
6       HELLO WORLD
7    water percent 
dtype: object

In [10]:
# remove a character
s.str.replace('%', '')

0              0
1      John Wood
2    Colin Welsh
3        my list
4          02456
5            NaN
6    HELLO WORLD
7          water
dtype: object

In [11]:
# slice each string with a specific index
s.str[0:2]

0      0
1     Jo
2     Co
3     my
4     02
5    NaN
6     HE
7     wa
dtype: object

In [12]:
# alternative to slicing
s.str.slice(0,2)

0      0
1     Jo
2     Co
3     my
4     02
5    NaN
6     HE
7     wa
dtype: object

In [13]:
# combination of slicing and replacing
s.str.slice_replace(0,2, '___')

0             ___
1      ___hn Wood
2    ___lin Welsh
3        ___ list
4          ___456
5             NaN
6    ___LLO WORLD
7         ___ter%
dtype: object

In [14]:
# check if contains a specific pattern
flag = s.str.contains('0')
flag

0     True
1    False
2    False
3    False
4     True
5      NaN
6    False
7    False
dtype: object

In [15]:
# the same without the Nan
flag = s.str.contains('0', na=False)
flag

0     True
1    False
2    False
3    False
4     True
5    False
6    False
7    False
dtype: bool

In [16]:
s[flag]

0        0
4    02456
dtype: object

## Example: cleaning up the movies dataset

In [180]:
# Load the data
movies = pd.read_csv("tmdb_5000_movies.csv")
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [181]:
genres = movies['genres']
genres[0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [182]:
import json

json_obj = json.loads(genres[0]) # Load json string
json_obj

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [183]:
names = [x["name"] for x in json_obj]
names

['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [184]:
# join list of words into a string
', '.join(names)

'Action, Adventure, Fantasy, Science Fiction'

In [185]:
# Let's do it manualla by first striping the bracket squares
def transform(s):
    s = s.str.strip("[]")
    s = s.str.replace("{", "", regex=True)
    s = s.str.replace("}", "", regex=True)
    s = s.str.replace(",", "", regex=True)
    s = s.str.replace('"id":', "", regex=True)
    s = s.str.replace('"name":', "", regex=True)
    s = s.str.replace('"', "", regex=True)
    s = s.str.replace("0", "", regex=True)
    s = s.str.replace("1", "", regex=True)
    s = s.str.replace("2", "", regex=True)
    s = s.str.replace("3", "", regex=True)
    s = s.str.replace("4", "", regex=True)
    s = s.str.replace("5", "", regex=True)
    s = s.str.replace("6", "", regex=True)
    s = s.str.replace("7", "", regex=True)
    s = s.str.replace("8", "", regex=True)
    s = s.str.replace("9", "", regex=True)
    s = s.str.replace('    ', ', ')
    s = s.str.replace('   ','')
    return s

In [186]:
genres = transform(genres)
genres[0]

'Action, Adventure, Fantasy, Science Fiction'

In [40]:
# regular expressions to remove digits:
s = s.str.replace('[0-9]+','', regex = True)
s = s.str.replace('\d+','', regex = True)

In [187]:
movies['genres'] = genres

In [188]:
movies.loc[:,['title', 'genres']].head(10)

Unnamed: 0,title,genres
0,Avatar,"Action, Adventure, Fantasy, Science Fiction"
1,Pirates of the Caribbean: At World's End,"Adventure, Fantasy, Action"
2,Spectre,"Action, Adventure, Crime"
3,The Dark Knight Rises,"Action, Crime, Drama, Thriller"
4,John Carter,"Action, Adventure, Science Fiction"
5,Spider-Man 3,"Fantasy, Action, Adventure"
6,Tangled,"Animation, Family"
7,Avengers: Age of Ultron,"Action, Adventure, Science Fiction"
8,Harry Potter and the Half-Blood Prince,"Adventure, Fantasy, Family"
9,Batman v Superman: Dawn of Justice,"Action, Adventure, Fantasy"


## Exercise: further practice with the movies dataset

In [189]:
keywords = transform(movies.keywords)
keywords.head()

0    culture clash, future, space war, space colony...
1    ocean, drug abuse, exotic island, east india t...
2    spy, based on novel, secret agent, sequel, mi,...
3    dc comics, crime fighter, terrorist, secret id...
4    based on novel, mars, medallion, space travel,...
Name: keywords, dtype: object

In [193]:
keywords_df = keywords.str.split(',', expand=True)
keywords_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,culture clash,future,space war,space colony,society,space travel,futuristic,romance,space,alien,...,,,,,,,,,,
1,ocean,drug abuse,exotic island,east india trading company,love of one's life,traitor,shipwreck,strong woman,ship,alliance,...,,,,,,,,,,
2,spy,based on novel,secret agent,sequel,mi,british secret service,united kingdom,,,,...,,,,,,,,,,
3,dc comics,crime fighter,terrorist,secret identity,burglar,hostage drama,time bomb,gotham city,vigilante,cover-up,...,,,,,,,,,,
4,based on novel,mars,medallion,space travel,princess,alien,steampunk,martian,escape,edgar rice burroughs,...,,,,,,,,,,


In [197]:
movies.keywords = keywords_df[0]+', '+keywords_df[1]+', '+keywords_df[2]

In [199]:
movies.keywords.head()

0       culture clash,  future,  space war
1       ocean,  drug abuse,  exotic island
2      spy,  based on novel,  secret agent
3    dc comics,  crime fighter,  terrorist
4        based on novel,  mars,  medallion
Name: keywords, dtype: object

## Regular expressions

In [200]:
s=pd.Series(['0', 'John Wood', 'Colin Welsh', 'my list', '02456', np.nan, 'HELLO WORLD', 'water%'])

In [201]:
s.str.contains('John')

0    False
1     True
2    False
3    False
4    False
5      NaN
6    False
7    False
dtype: object

In [202]:
s.str.contains('John') | s.str.contains('Colin')

0    False
1     True
2     True
3    False
4    False
5    False
6    False
7    False
dtype: bool

In [203]:
s.str.contains('John|Colin')

0    False
1     True
2     True
3    False
4    False
5      NaN
6    False
7    False
dtype: object

In [204]:
s2 = pd.Series(['bar', 'sugar', 'cartoon', 'argon'])

In [205]:
s2.str.contains('.ar')

0     True
1     True
2     True
3    False
dtype: bool

In [206]:
s2.str.contains('[bc]ar')

0     True
1    False
2     True
3    False
dtype: bool

In [207]:
s[s.str.contains('[0-9]', na=False)]

0        0
4    02456
dtype: object

In [208]:
s[s.str.contains('[\d]', na=False)]

0        0
4    02456
dtype: object

In [209]:
s2[s2.str.contains('^[bc]', na=False)]

0        bar
2    cartoon
dtype: object

In [210]:
s2[s2.str.contains('ar$', na=False)]

0      bar
1    sugar
dtype: object

In [211]:
s3= pd.Series(['forest', 'o', 'ff', 'foo', 'fof'])
s3.str.contains('f+o?f+')

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [212]:
s4= pd.Series(['Monday5km', 'Wednesday10km', 'Saturday25km'])

In [213]:
# Extract weekday names in a new column
s4.str.extract("(\w+day)", expand=True)

Unnamed: 0,0
0,Monday
1,Wednesday
2,Saturday


In [214]:
# Extract weekday names and distances in km in separate columns
s4.str.extract("(\w+day)(\d+km)", expand=True)

Unnamed: 0,0,1
0,Monday,5km
1,Wednesday,10km
2,Saturday,25km


In [215]:
# Define string sample
sample = 'Monday5km'
sample
# 'Monday5km'

'Monday5km'

In [216]:
# Import re library
import re

# Match groups according to regex pattern
m = re.match('(\w+day)(\d+km)', # regex pattern
             sample              # string sample
            )

# Show matched groups
m.groups()
# ('Monday', '5km')

('Monday', '5km')

In [217]:
# Show first matched group
m.groups()[0]
# 'Monday'

# Show second matched group
m.groups()[1]

'5km'

In [218]:
m.groups()[0][:3]

'Mon'

In [219]:
def f(x):
    return x.groups()[0][:3]

In [220]:
s4.str.replace("(\w+day)",
               f,           
               regex=True
              )

0     Mon5km
1    Wed10km
2    Sat25km
dtype: object

## Exercise: using regular expressions in pandas

In [221]:
meal_plan = ['Monday: 9:12am – Omelet,  3:30pm– Apple slices with almond butter', 
             'Tuesday: 9:35am – Banana bread, 11:00am –Sauteed veggies, 7:02pm– Taco pie',
             'Wednesday: 9:00am – Banana pancakes',  
             'Thursday: 7:23pm– Slow cooker pulled pork', 'Friday: 3:30pm – Can of tuna', 
             'Saturday: 9:11am: Eggs and sweet potato hash browns, 3:22pm: Almonds', 
             'Sunday: 11:00am: Meat and veggie stir fry'] 

In [233]:
df = pd.DataFrame(meal_plan, columns=['text'])
df

Unnamed: 0,text
0,"Monday: 9:12am – Omelet, 3:30pm– Apple slices..."
1,"Tuesday: 9:35am – Banana bread, 11:00am –Saute..."
2,Wednesday: 9:00am – Banana pancakes
3,Thursday: 7:23pm– Slow cooker pulled pork
4,Friday: 3:30pm – Can of tuna
5,Saturday: 9:11am: Eggs and sweet potato hash b...
6,Sunday: 11:00am: Meat and veggie stir fry


In [296]:
weekdays = df['text'].str.extractall(r'(?P<weekdays>\w+day)') # to name a column
days = weekdays['weekdays'].str[:3]
meals = ['breakfast', 'lunch', 'dinner']

In [304]:
sol = df['text'].str.extractall("(\d?\d):(\d\d) ?([ap]m)")
sol.index.set_levels([days, meals], inplace = True)
sol.index.set_names(["Day", "Meal"], inplace = True)
sol.columns = ['Hour', 'Minutes', 'Period']
sol

Unnamed: 0_level_0,Unnamed: 1_level_0,Hour,Minutes,Period
Day,Meal,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mon,breakfast,9,12,am
Mon,lunch,3,30,pm
Tue,breakfast,9,35,am
Tue,lunch,11,0,am
Tue,dinner,7,2,pm
Wed,breakfast,9,0,am
Thu,breakfast,7,23,pm
Fri,breakfast,3,30,pm
Sat,breakfast,9,11,am
Sat,lunch,3,22,pm


## Sentiment Analysis

In [318]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import string
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/valentinmonney/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [309]:
df = pd.read_csv('tweets.csv', header = None)
df.columns = ['sentiment', 'text']
df.head()

Unnamed: 0,sentiment,text
0,4,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,Reading my kindle2... Love it... Lee childs i...
2,4,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,@kenburbary You'll love your Kindle2. I've had...
4,4,@mikefish Fair enough. But i have the Kindle2...


In [314]:
#total number of tweets
df.shape[0]
#number of positive tweets
df[df['sentiment'] == 4].shape[0]
#number of neutral tweets
df[df['sentiment'] == 2].shape[0]
#number of negative tweets
df[df['sentiment'] == 0].shape[0]

177

In [316]:
pos_tweets = df.loc[df['sentiment'] == 4, 'text']
neg_tweets = df.loc[df['sentiment'] == 0, 'text']

In [325]:
stopwords_english = stopwords.words('english')
stopwords_english

# Happy Emoticons
emoticons_happy = set([
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3'
    ])

# Sad Emoticons
emoticons_sad = set([
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('
    ])

# all emoticons (happy + sad)
emoticons = emoticons_happy.union(emoticons_sad)

def clean_tweets(tweet):
    
    # remove hyperlinks
    tweet = re.sub(pattern = r'https?:\/\/[^\s]+', repl='', string= tweet)
        
    # remove hashtags
    tweet = re.sub(r'#', '', tweet)  
    
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case = False, reduce_len=True, strip_handles=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    
    tweets_clean = []
    for word in tweet_tokens:
        if( word not in stopwords_english and # removes stopwords
            #word not in emoticons and # removes emoticons
            word not in string.punctuation): # removes punctuation)
            tweets_clean.append(word)
           
    return(tweets_clean)

In [327]:
sample = pos_tweets.iloc[4]
sample

"@mikefish  Fair enough. But i have the Kindle2 and I think it's perfect  :)"

In [328]:
clean_tweets(sample)

['fair', 'enough', 'kindle', '2', 'think', 'perfect']

In [356]:
# defining our bag of words model
def bag_of_words(tweet):
    words = clean_tweets(tweet)
    words_dictionary = dict([word,True] for word in words)
    return words_dictionary

In [357]:
bag_of_words(sample)

{'fair': True,
 'enough': True,
 'kindle': True,
 '2': True,
 'think': True,
 'perfect': True}

In [358]:
# Creating a feature set of positive tweets
pos_tweets_set = []
for tweet in pos_tweets:
    pos_tweets_set.append((bag_of_words(tweet), 'pos'))
    
# Creating a feature set of negative tweets
neg_tweets_set = []
for tweet in neg_tweets:
    neg_tweets_set.append((bag_of_words(tweet), 'neg'))

tweets = pos_tweets_set + neg_tweets_set

In [359]:
# implement the Naive Bayes algorithm
from random import shuffle
shuffle(pos_tweets_set)
shuffle(neg_tweets_set)

test_set = pos_tweets_set[:36] + neg_tweets_set[:36]
train_set = pos_tweets_set[36:] + neg_tweets_set[:36]

In [360]:
from nltk import classify
from nltk import NaiveBayesClassifier

# Classify our train data with the Naive Bayes Classifier
classifier = NaiveBayesClassifier.train(train_set)

In [361]:
# test the accuraca of our classifier
accuracy = classify.accuracy(classifier, test_set)
accuracy

0.8472222222222222

In [362]:
# show the most importang features 
classifier.show_most_informative_features(10)

Most Informative Features
                    hate = True              neg : pos    =      9.3 : 1.0
                      .. = True              neg : pos    =      9.3 : 1.0
                    fail = True              neg : pos    =      6.6 : 1.0
                    time = True              neg : pos    =      6.6 : 1.0
                customer = True              neg : pos    =      6.6 : 1.0
                    even = True              neg : pos    =      4.0 : 1.0
                   would = True              neg : pos    =      4.0 : 1.0
                  office = True              neg : pos    =      4.0 : 1.0
                   voice = True              neg : pos    =      4.0 : 1.0
                  higher = True              neg : pos    =      4.0 : 1.0


In [343]:
from collections import defaultdict
from nltk.metrics import ConfusionMatrix

In [352]:
actual_set_cm = []
predicted_set_cm = []

for index, (feature, actual_label) in enumerate(test_set):
    actual_set[actual_label].add(index)
    actual_set_cm.append(actual_label)
    
    predicted_label = classifier.classify(feature)
    
    predicted_set[predicted_label].add(index)
    predicted_set_cm.append(predicted_label)
    
print(ConfusionMatrix(actual_set_cm, predicted_set_cm))
    

    |  n  p |
    |  e  o |
    |  g  s |
----+-------+
neg |<36> . |
pos |  5<31>|
----+-------+
(row = reference; col = test)



['pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg']