In [136]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import brown
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger
import math
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

In [69]:
income_json = pd.read_json('Income.json')

In [70]:
income_json

Unnamed: 0,# of kids,Income,State
0,5,25000,CA
1,5,122500,NY
2,2,142007,TX
3,2,42007,TX
4,0,14704,TX
5,1,200704,TX
6,1,120070,CA
7,3,207040,NY
8,3,48000,NY
9,3,79000,NY


In [71]:
type(income_json)

pandas.core.frame.DataFrame

In [72]:
# lowercasing across the dataframe

income_json = income_json.apply(lambda x: x.astype(str).str.lower())

In [73]:
# and the columns

income_json.columns = map(str.lower, income_json.columns)

In [74]:
income_json

Unnamed: 0,# of kids,income,state
0,5,25000,ca
1,5,122500,ny
2,2,142007,tx
3,2,42007,tx
4,0,14704,tx
5,1,200704,tx
6,1,120070,ca
7,3,207040,ny
8,3,48000,ny
9,3,79000,ny


In [80]:
# removing punctuation, though this is redundant since the data contains
# no punctuation to begin with.

income_json = income_json.apply(lambda x: x.astype(str).replace('[^\w\s]',''))

In [81]:
# removing stop words

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tdrace\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [82]:
stop = stopwords.words('english')

In [84]:
# removing stop words. Since there are no stop words in the data (outside of
# the column names), this too is redundant, but here's how you can do it.

income_json['state'] = income_json['state'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [86]:
# Using PorterStemmer. Again, there is no true data to use this on, the state
# names are just abbrevations, but here's one way to do it.

ps = PorterStemmer() 

In [87]:
words = income_json['state']

In [88]:
for w in words: 
    print(w, " : ", ps.stem(w)) 

ca  :  ca
ny  :  ny
tx  :  tx
tx  :  tx
tx  :  tx
tx  :  tx
ca  :  ca
ny  :  ny
ny  :  ny
ny  :  ny


In [90]:
# Here's what it looks like for a sentence

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tdrace\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [91]:
sentence = "All blue brains doth right the righteous king"
words = word_tokenize(sentence)

In [92]:
for w in words: 
    print(w, " : ", ps.stem(w)) 

All  :  all
blue  :  blue
brains  :  brain
doth  :  doth
right  :  right
the  :  the
righteous  :  righteou
king  :  king


In [104]:
# Using TF-IDF on random text data.
# following tutorial here:
# https://towardsdatascience.com/tfidf-for-piece-of-text-in-python-43feccaa74f8

text1 = "All blue brains doth right the righteous king. Here lies sad and blighted boy Rupert. Ring around the rosie and pockets full of posies, and so forth!"

In [107]:
# function to remove special characters

def remove_string_special_characters(s):
    stripped = re.sub('[^\w\s]', '', s)
    stripped = re.sub('_', '', stripped)
    stripped = re.sub('\s+', ' ', stripped)
    stripped = stripped.strip()
    return stripped

In [97]:
# function to split the text into sentences, calculating word count of each.
# Each sentence is a document.

def get_doc(sent):
    doc_info = []
    i = 0
    for sent in text_sents_clean:
        i += 1
        count = count_words(sent)
        temp = {'doc_id': i, 'doc_length': count}
        doc_info.append(temp)
    return doc_info

In [98]:
# function to return total number of words in the text

def count_words(sent):
    count = 0
    wirds = word_tokenize(sent)
    for word in words:
        count += 1
    return count

In [99]:
# function to create a frequency dictionary of each word in each document

def create_freq_dict(sents):
    i = 0
    freqDict_list = []
    for sent in sents:
        i += 1
        freq_dict = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            if word in freq_dict:
                freq_dict[word] += 1
            else:
                freq_dict[word] = 1
            temp = {'doc_id': i, 'freq_dict': freq_dict}
        freqDict_list.append(temp)
    return freqDict_list

In [100]:
# function to compute the tf, frequency of a term in the document divided by
# the total number of terms in the document

def computeTF(doc_info, freqDict_list):
    TF_scores = []
    for tempDict in freqDict_list:
        id = tempDict['doc_id']
        for k in tempDict['freq_dict']:
            temp = {'doc_id': id,
                   'TF_score': tempDict['freq_dict'][k]/doc_info[id-1]['doc_length'],
                   'key': k}
            TF_scores.append(temp)
        return TF_scores

In [101]:
# function to compute idf

def computeIDF(doc_info, freqDict_list):
    IDF_scores = []
    counter = 0
    for dict in freqDict_list:
        counter += 1
        for k in dict['freq_dict'].keys():
            count = sum([k in tempDict['freq_dict'] for tempDict in freqDict_list])
            temp = {'doc_id': counter, 'IDF_score': math.log(len(doc_info)/count), 'key': k}
            
            IDF_scores.append(temp)
            
        return IDF_scores

In [102]:
# function to compute TF-IDF

def computeTFIDF(TF_scores, IDF_scores):
    TFIDF_scores = []
    for j in IDF_scores:
        for i in TF_scores:
            if j['key'] == i['key'] and j['doc_id'] == i['doc_id']:
                temp = {'doc_id': j['doc_id'],
                       'TFIDF_score': j['IDF_score']*i['TF_score'],
                       'key': i['key']}
        TFIDF_scores.append(temp)
    return TFIDF_scores

In [105]:
text_sents = sent_tokenize(text1)

In [110]:
text_sents_clean = [remove_string_special_characters(s) for s in text_sents]

In [111]:
doc_info = get_doc(text_sents_clean)

In [112]:
freqDict_list = create_freq_dict(text_sents_clean)

In [113]:
TF_scores = computeTF(doc_info, freqDict_list)

In [114]:
IDF_scores = computeIDF(doc_info, freqDict_list)

In [115]:
doc_info

[{'doc_id': 1, 'doc_length': 10},
 {'doc_id': 2, 'doc_length': 10},
 {'doc_id': 3, 'doc_length': 10}]

In [116]:
freqDict_list

[{'doc_id': 1,
  'freq_dict': {'all': 1,
   'blue': 1,
   'brains': 1,
   'doth': 1,
   'right': 1,
   'the': 1,
   'righteous': 1,
   'king': 1}},
 {'doc_id': 2,
  'freq_dict': {'here': 1,
   'lies': 1,
   'sad': 1,
   'and': 1,
   'blighted': 1,
   'boy': 1,
   'rupert': 1}},
 {'doc_id': 3,
  'freq_dict': {'ring': 1,
   'around': 1,
   'the': 1,
   'rosie': 1,
   'and': 2,
   'pockets': 1,
   'full': 1,
   'of': 1,
   'posies': 1,
   'so': 1,
   'forth': 1}}]

In [117]:
TF_scores

[{'doc_id': 1, 'TF_score': 0.1, 'key': 'all'},
 {'doc_id': 1, 'TF_score': 0.1, 'key': 'blue'},
 {'doc_id': 1, 'TF_score': 0.1, 'key': 'brains'},
 {'doc_id': 1, 'TF_score': 0.1, 'key': 'doth'},
 {'doc_id': 1, 'TF_score': 0.1, 'key': 'right'},
 {'doc_id': 1, 'TF_score': 0.1, 'key': 'the'},
 {'doc_id': 1, 'TF_score': 0.1, 'key': 'righteous'},
 {'doc_id': 1, 'TF_score': 0.1, 'key': 'king'}]

In [118]:
IDF_scores

[{'doc_id': 1, 'IDF_score': 1.0986122886681098, 'key': 'all'},
 {'doc_id': 1, 'IDF_score': 1.0986122886681098, 'key': 'blue'},
 {'doc_id': 1, 'IDF_score': 1.0986122886681098, 'key': 'brains'},
 {'doc_id': 1, 'IDF_score': 1.0986122886681098, 'key': 'doth'},
 {'doc_id': 1, 'IDF_score': 1.0986122886681098, 'key': 'right'},
 {'doc_id': 1, 'IDF_score': 0.4054651081081644, 'key': 'the'},
 {'doc_id': 1, 'IDF_score': 1.0986122886681098, 'key': 'righteous'},
 {'doc_id': 1, 'IDF_score': 1.0986122886681098, 'key': 'king'}]

In [119]:
TFIDF_scores = computeTFIDF(TF_scores, IDF_scores)

In [120]:
TFIDF_scores

[{'doc_id': 1, 'TFIDF_score': 0.10986122886681099, 'key': 'all'},
 {'doc_id': 1, 'TFIDF_score': 0.10986122886681099, 'key': 'blue'},
 {'doc_id': 1, 'TFIDF_score': 0.10986122886681099, 'key': 'brains'},
 {'doc_id': 1, 'TFIDF_score': 0.10986122886681099, 'key': 'doth'},
 {'doc_id': 1, 'TFIDF_score': 0.10986122886681099, 'key': 'right'},
 {'doc_id': 1, 'TFIDF_score': 0.04054651081081644, 'key': 'the'},
 {'doc_id': 1, 'TFIDF_score': 0.10986122886681099, 'key': 'righteous'},
 {'doc_id': 1, 'TFIDF_score': 0.10986122886681099, 'key': 'king'}]

In [122]:
# encoding dictionaries of features

# Create dictionary
data_dict = [{"Red": 2, "Blue": 4},
             {"Red": 4, "Blue": 3},
             {"Red": 1, "Yellow": 2},
             {"Red": 2, "Yellow": 2}]

# Create dictionary vectorizer
dictvectorizer = DictVectorizer(sparse=False)

# Convert dictionary to feature matrix
features = dictvectorizer.fit_transform(data_dict)

# View feature matrix
features

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [123]:
# Get feature names
feature_names = dictvectorizer.get_feature_names()

# View feature names
feature_names

['Blue', 'Red', 'Yellow']

In [124]:
# Create dataframe from features
pd.DataFrame(features, columns=feature_names)

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0


In [125]:
# Create word counts dictionaries for four documents
doc_1_word_count = {"Red": 2, "Blue": 4}
doc_2_word_count = {"Red": 4, "Blue": 3}
doc_3_word_count = {"Red": 1, "Yellow": 2}
doc_4_word_count = {"Red": 2, "Yellow": 2}

# Create list
doc_word_counts = [doc_1_word_count,
                   doc_2_word_count,
                   doc_3_word_count,
                   doc_4_word_count]

# Convert list of word count dictionaries into feature matrix
dictvectorizer.fit_transform(doc_word_counts)

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [126]:
# tokenizing text

# Create text
string = "The science of today is the technology of tomorrow"

# Tokenize words
word_tokenize(string)

['The', 'science', 'of', 'today', 'is', 'the', 'technology', 'of', 'tomorrow']

In [127]:
string = "The science of today is the technology of tomorrow. Tomorrow is today."

# Tokenize sentences
sent_tokenize(string)

['The science of today is the technology of tomorrow.', 'Tomorrow is today.']

In [130]:
# tagging parts of speech

nltk.download('averaged_perceptron_tagger')

text_data = "Chris loved outdoor running"

# Use pre-trained part of speech tagger
text_tagged = pos_tag(word_tokenize(text_data))

# Show parts of speech
text_tagged

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tdrace\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


[('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]

In [131]:
# Filter words
[word for word, tag in text_tagged if tag in ['NN','NNS','NNP','NNPS'] ]

['Chris']

In [134]:
# Create text
tweets = ["I am eating a burrito for breakfast",
          "Political science is an amazing field",
          "San Francisco is an awesome city"]

# Create list
tagged_tweets = []

# Tag each word and each tweet
for tweet in tweets:
    tweet_tag = nltk.pos_tag(word_tokenize(tweet))
    tagged_tweets.append([tag for word, tag in tweet_tag])

# Use one-hot encoding to convert the tags into features
one_hot_multi = MultiLabelBinarizer()
one_hot_multi.fit_transform(tagged_tweets)

array([[1, 1, 0, 1, 0, 1, 1, 1, 0],
       [1, 0, 1, 1, 0, 0, 0, 0, 1],
       [1, 0, 1, 1, 1, 0, 0, 0, 1]])

In [135]:
# Show feature names
one_hot_multi.classes_

array(['DT', 'IN', 'JJ', 'NN', 'NNP', 'PRP', 'VBG', 'VBP', 'VBZ'],
      dtype=object)

In [138]:
nltk.download('brown')

# Get some text from the Brown Corpus, broken into sentences
sentences = brown.tagged_sents(categories='news')

# Split into 4000 sentences for training and 623 for testing
train = sentences[:4000]
test = sentences[4000:]

# Create backoff tagger
unigram = UnigramTagger(train)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)

# Show accuracy
trigram.evaluate(test)

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\tdrace\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


0.8174734002697437