In [1]:
import pandas as pd

  return f(*args, **kwds)
  return f(*args, **kwds)


# Twitter Sentiment Analysis Training Corpus

In [2]:
tweets = pd.read_csv('/home/yungshun/workspace/py3/feature-construction/datasets/twitter_sentiment_analysis_training_corpus/Sentiment Analysis Dataset.csv', encoding='latin1', error_bad_lines=False)

b'Skipping line 8836: expected 4 fields, saw 5\n'
b'Skipping line 535882: expected 4 fields, saw 7\n'


In [3]:
tweets.head()

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
0,1,0,Sentiment140,is so sad for my APL frie...
1,2,0,Sentiment140,I missed the New Moon trail...
2,3,1,Sentiment140,omg its already 7:30 :O
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...


In [4]:
# del tweets['ItemID']
columns = ['ItemID', 'SentimentSource']
tweets.drop(columns, inplace=True, axis=1)

In [5]:
tweets.head()

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...


In [6]:
X = tweets['SentimentText']
y = tweets['Sentiment']

## Text-specific feature construction

The most common method to transform a corpus into a numerical representation, a process known as vectorization, is through a method called <b>bag-of-words</b>. The basic idea behind the bag of words approach is that documents are described by word occurences while completely ignoring the positioning of words in the document. In its simplest form, text is represented as a bag, without regard for grammar or word order, and is maintained as a set, with importance given to multiplicity.
1. <b>Tokenizing</b>
2. <b>Counting</b>
3. <b>Normalizing</b>

### CountVectorizer()

Similar to dummy variables, in the sense that <b>CountVectorizer</b> converts text columns into matrices where columns are tokens and cell values are counts of occurrences of each token in each document. The resulting matrix is referred to as a <b>document-term matrix</b> because each row will represent a <b>document</b> (in this case, a tweet) and each column represents a <b>term</b> (a word).
1. stop_words
2. min_df
3. max_df
4. ngram_range
5. analyzer

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [8]:
# Without any parameter
vect = CountVectorizer()
_ = vect.fit_transform(X)
print(_.shape)

(1578612, 683213)


In [9]:
# stop_words

In [10]:
# Removes a set of english stop words (if, a, the, etc)
vect = CountVectorizer(stop_words='english')
_ = vect.fit_transform(X)
print(_.shape)

(1578612, 682902)


In [11]:
vect.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [12]:
# min_df

In [13]:
# Only includes words that occur in at least 5% of the corpus documents
vect = CountVectorizer(min_df=.05)
# Used to skim the number of features
_ = vect.fit_transform(X)
print(_.shape)

(1578612, 30)


In [14]:
# max_df

In [15]:
# Only includes words that occur at most 80% of the documents
vect = CountVectorizer(max_df=.8)
# Used to "deduce" stop words
_ = vect.fit_transform(X)
print(_.shape)

(1578612, 683213)


In [16]:
# ngram_range

In [17]:
# Also includes phrases up to 2 words
vect = CountVectorizer(ngram_range=(1, 2))
_ = vect.fit_transform(X)
# Explodes the number of features
print(_.shape)

(1578612, 5097548)


In [18]:
vect.get_feature_names()[:15]

['00',
 '00 00',
 '00 01',
 '00 01am',
 '00 01s',
 '00 02',
 '00 03',
 '00 05',
 '00 05am',
 '00 06',
 '00 09',
 '00 10',
 '00 12',
 '00 13',
 '00 14']

In [19]:
# lowercase

In [20]:
# Lower cases everything first
vect = CountVectorizer(lowercase=True)
_ = vect.fit_transform(X)
print(_.shape)  
# Features stays the same

(1578612, 683213)


In [21]:
# max_features

In [22]:
# Hard limits the features based on max counts
vect = CountVectorizer(max_features=1000)
_ = vect.fit_transform(X)
print(_.shape)

(1578612, 1000)


In [23]:
# analyzer

In [24]:
# Default analyzer, decides to split into words
vect = CountVectorizer(analyzer='word')
_ = vect.fit_transform(X)
print(_.shape)  

(1578612, 683213)


In [25]:
# Used characters
vect = CountVectorizer(analyzer='char')
_ = vect.fit_transform(X)
print(_.shape)  

(1578612, 137)


In [26]:
vect.get_feature_names()[:10]

['\t', ' ', '!', '"', '#', '$', '%', '&', "'", '(']

In [27]:
# Uses characters again but only those are aren't at the beginning or ends of words
vect = CountVectorizer(analyzer='char_wb')
# wb stands for word boudnaries
_ = vect.fit_transform(X)
print(_.shape)

(1578612, 134)


In [28]:
# Make a custom analyzer

In [29]:
from nltk.stem.snowball import SnowballStemmer

  return f(*args, **kwds)


In [30]:
stemmer = SnowballStemmer('english')

In [31]:
stemmer.stem('hello')

'hello'

In [32]:
stemmer.stem('interesting')

'interest'

In [33]:
stemmer.stem('interesting') == stemmer.stem('interest')

True

In [34]:
# Define a function that accepts text and returns a list of lemmas
def word_tokenize(text, how='lemma'):
    # Tokenize into words
    words = text.split(' ')
    return [stemmer.stem(word) for word in words]

In [35]:
word_tokenize("hello you are very interesting")

['hello', 'you', 'are', 'veri', 'interest']

In [36]:
vect = CountVectorizer(analyzer=word_tokenize)
_ = vect.fit_transform(X)
print(_.shape)  
# Fewer features as stemming makes words smaller

(1578612, 1130234)


### TfidfVectorizer()

<b>tf</b>: term frequency<br/>
<b>idf</b>: inverse document frequency

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
vect = CountVectorizer()
_ = vect.fit_transform(X)
print(_.shape, _[0,:].mean())

vect = TfidfVectorizer()
_ = vect.fit_transform(X)
# Same number of rows and columns, different cell values
print(_.shape, _[0,:].mean())

(1578612, 683213) 1.0245706682981736e-05
(1578612, 683213) 3.1600657703918664e-06


In [39]:
# Machine learning

In [40]:
# Get the null accuracy
y.value_counts(normalize=True)

1    0.500552
0    0.499448
Name: Sentiment, dtype: float64

In [41]:
# For faster predictions with large number of features
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [42]:
# Set our pipeline parameters
pipe_params = {'vect__ngram_range':[(1, 1), (1, 2)], 'vect__max_features':[1000, 10000], 'vect__stop_words':[None, 'english']}

# Instantiate our pipeline
pipe = Pipeline([('vect', CountVectorizer()), ('classify', MultinomialNB())])

# Instantiate our gridsearch object
grid = GridSearchCV(pipe, pipe_params)
# Fit the gridsearch object
grid.fit(X, y)

# Get our results
print(grid.best_score_, grid.best_params_)

0.7647281282544413 {'vect__stop_words': None, 'vect__ngram_range': (1, 2), 'vect__max_features': 10000}


### FeatureUnion()

In [None]:
# We can build a featurizer that runs both a TfidfVectorizer and a CountVectorizer on the tweets 
# and concatenates them horizontally (keeping the same number of rows but increasing the number of columns).

In [43]:
from sklearn.pipeline import FeatureUnion

In [44]:
# Build a separate featurizer object
featurizer = FeatureUnion([('tfidf_vect', TfidfVectorizer()), ('count_vect', CountVectorizer())])

_ = featurizer.fit_transform(X)
print(_.shape)
# Same number of rows , but twice as many columns as either CV or TFIDF

(1578612, 1366426)


In [45]:
featurizer.set_params(tfidf_vect__max_features=100, 
                      count_vect__ngram_range=(1, 2), 
                      count_vect__max_features=300)
# The TfidfVectorizer will only keep 100 words while the CountVectorizer will keep 300 of 1 and 2 word phrases
_ = featurizer.fit_transform(X)
print(_.shape)  
# Same number of rows, but twice as many columns as either CV or TFIDF

(1578612, 400)


In [46]:
pipe_params = {'featurizer__count_vect__ngram_range':[(1, 1), (1, 2)], 
               'featurizer__count_vect__max_features':[1000, 10000], 
               'featurizer__count_vect__stop_words':[None, 'english'],
               'featurizer__tfidf_vect__ngram_range':[(1, 1), (1, 2)], 
               'featurizer__tfidf_vect__max_features':[1000, 10000], 
               'featurizer__tfidf_vect__stop_words':[None, 'english']}

pipe = Pipeline([('featurizer', featurizer), ('classify', MultinomialNB())])

grid = GridSearchCV(pipe, pipe_params)
grid.fit(X, y)

print(grid.best_score_, grid.best_params_)

0.771662701157726 {'featurizer__tfidf_vect__ngram_range': (1, 2), 'featurizer__count_vect__ngram_range': (1, 1), 'featurizer__count_vect__max_features': 10000, 'featurizer__tfidf_vect__max_features': 10000, 'featurizer__count_vect__stop_words': None, 'featurizer__tfidf_vect__stop_words': None}
