In [1]:
import pandas as pd
from string import punctuation
import nltk
nltk.download('tagsets')
from nltk.data import load
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
from nltk import word_tokenize
from collections import Counter

[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/kaylanguyen/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kaylanguyen/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [5]:
def get_tagsets():
    """Return the tag names stored in the Penn Treebank tagset help resource.

    The resource is read through ``nltk.data.load``; the keys of the loaded
    mapping are returned as a list, in the mapping's own iteration order.
    """
    upenn_tags = load('help/tagsets/upenn_tagset.pickle')
    return [tag_name for tag_name in upenn_tags.keys()]
# Build the list of tag names once and print it so the available tags can be inspected.
tag_list = get_tagsets()
print(tag_list)

['LS', 'TO', 'VBN', "''", 'WP', 'UH', 'VBG', 'JJ', 'VBZ', '--', 'VBP', 'NN', 'DT', 'PRP', ':', 'WP$', 'NNPS', 'PRP$', 'WDT', '(', ')', '.', ',', '``', '$', 'RB', 'RBR', 'RBS', 'VBD', 'IN', 'FW', 'RP', 'JJR', 'JJS', 'PDT', 'MD', 'VB', 'WRB', 'NNP', 'EX', 'NNS', 'SYM', 'CC', 'CD', 'POS']


In [6]:
def get_pos_occurrence_freq(data, tag_list):
    """Count how often each part-of-speech tag occurs in every text row.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame exposing a ``text`` attribute/column. Each value is converted
        with ``str`` (as the other feature helpers in this notebook do),
        tokenized with ``word_tokenize`` and tagged with ``pos_tag``.
    tag_list : list of str
        Tags that must be present as columns, in this order, even when a tag
        never occurs in the data.

    Returns
    -------
    pandas.DataFrame
        One row per text (0..n-1 index) and one float column per tag; a tag
        that does not occur in a row is 0. Tags emitted by the tagger that are
        not in ``tag_list`` are kept as extra columns after the requested ones.
    """
    # Collect one {tag: count} mapping per row and build the frame in a single
    # step. DataFrame.append was removed in pandas 2.0, and appending inside a
    # loop copies the whole frame on every iteration.
    rows = [
        dict(Counter(tag for _, tag in pos_tag(word_tokenize(str(text_line)))))
        for text_line in data.text
    ]
    # An explicit RangeIndex keeps one row per text even if every mapping is empty.
    counts = pd.DataFrame(rows, index=pd.RangeIndex(len(rows)))

    # Requested tags come first (in the caller's order); unexpected tags follow.
    requested = set(tag_list)
    unexpected = [col for col in counts.columns if col not in requested]
    counts = counts.reindex(columns=list(tag_list) + unexpected)

    # Missing tags become 0; casting keeps the dtype independent of the data.
    return counts.fillna(0).astype(float)

In [7]:
# NOTE(review): an earlier cell already assigns tag_list from this same call;
# this re-assignment looks redundant — confirm before removing.
tag_list = get_tagsets()

In [8]:
# Load the dataset, build the POS-tag count features for each text row, and preview them.
# NOTE(review): the CSV path is absolute and machine-specific, so this cell will not run
# elsewhere as written — consider a path relative to a configurable data directory.
data = pd.read_csv('/Users/kaylanguyen/Documents/TheNLPWorkshop/Chapter2/data.csv', header=0)
feature_df = get_pos_occurrence_freq(data, tag_list)
feature_df.head()

Unnamed: 0,LS,TO,VBN,'',WP,UH,VBG,JJ,VBZ,--,...,MD,VB,WRB,NNP,EX,NNS,SYM,CC,CD,POS
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
def add_punctuation_count(feature_df, data):
    """Add a ``num_of_unique_punctuations`` column to ``feature_df``.

    For every value in ``data['text']`` the column holds the number of
    distinct characters that are also in ``string.punctuation``.

    Parameters
    ----------
    feature_df : pandas.DataFrame
        Frame that receives the new column (modified in place).
    data : pandas.DataFrame
        Frame with a ``text`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``feature_df`` object, with the new column set.
    """
    punctuation_chars = set(punctuation)

    def _unique_punctuation(value):
        # str() mirrors the other feature helpers in this notebook, so a
        # non-string cell (e.g. NaN) is counted instead of raising TypeError.
        return len(punctuation_chars.intersection(str(value)))

    feature_df['num_of_unique_punctuations'] = data['text'].apply(_unique_punctuation)
    return feature_df

In [20]:
# Attach the unique-punctuation count column and preview its first rows.
feature_df = add_punctuation_count(feature_df, data)
feature_df['num_of_unique_punctuations'].head()

0    0
1    0
2    1
3    1
4    0
Name: num_of_unique_punctuations, dtype: int64

In [21]:
def get_capitalized_word_count(feature_df, data):
    """Add a ``number_of_capital_words`` column to ``feature_df``.

    Each value of ``data['text']`` is converted with ``str`` and tokenized;
    the column holds how many tokens start with an uppercase character.
    ``feature_df`` is modified in place and returned.
    """
    def _count_capitalized(value):
        tokens = word_tokenize(str(value))
        return sum(1 for token in tokens if token[0].isupper())

    capital_counts = data['text'].apply(_count_capitalized)
    feature_df['number_of_capital_words'] = capital_counts
    return feature_df

In [22]:
# Attach the capitalized-word count column and preview its first rows.
feature_df = get_capitalized_word_count(feature_df, data)
feature_df['number_of_capital_words'].head()

0    1
1    1
2    1
3    1
4    1
Name: number_of_capital_words, dtype: int64

In [23]:
def get_small_word_count(feature_df, data):
    """Add a ``number_of_small_words`` column to ``feature_df``.

    Each value of ``data['text']`` is converted with ``str`` and tokenized;
    the column holds how many tokens start with a lowercase character.
    ``feature_df`` is modified in place and returned.
    """
    def _count_lowercase_initial(value):
        tokens = word_tokenize(str(value))
        return sum(1 for token in tokens if token[0].islower())

    small_counts = data['text'].apply(_count_lowercase_initial)
    feature_df['number_of_small_words'] = small_counts
    return feature_df

In [24]:
# Attach the lowercase-initial word count column and preview its first rows.
feature_df = get_small_word_count(feature_df, data)
feature_df['number_of_small_words'].head()

0    4
1    3
2    7
3    3
4    2
Name: number_of_small_words, dtype: int64

In [25]:
def get_number_of_alphabets(feature_df, data):
    """Add a ``number_of_alphabets`` column to ``feature_df``.

    The column holds, for each value of ``data['text']`` (converted with
    ``str``), the number of characters for which ``str.isalpha`` is true.
    ``feature_df`` is modified in place and returned.
    """
    def _count_letters(value):
        return sum(1 for char in str(value) if char.isalpha())

    letter_counts = data['text'].apply(_count_letters)
    feature_df['number_of_alphabets'] = letter_counts
    return feature_df

In [26]:
# Attach the alphabetic-character count column and preview its first rows.
feature_df = get_number_of_alphabets(feature_df, data)
feature_df['number_of_alphabets'].head()

0    19
1    18
2    28
3    14
4    13
Name: number_of_alphabets, dtype: int64

In [27]:
def get_number_of_digit_count(feature_df, data):
    """Add a ``number_of_digits`` column to ``feature_df``.

    The column holds, for each value of ``data['text']`` (converted with
    ``str``), the number of characters for which ``str.isdigit`` is true.
    ``feature_df`` is modified in place and returned.
    """
    def _count_digits(value):
        return sum(1 for char in str(value) if char.isdigit())

    digit_counts = data['text'].apply(_count_digits)
    feature_df['number_of_digits'] = digit_counts
    return feature_df

In [28]:
# Attach the digit-character count column and preview its first rows.
feature_df = get_number_of_digit_count(feature_df, data)
feature_df['number_of_digits'].head()

0    0
1    0
2    0
3    0
4    0
Name: number_of_digits, dtype: int64

In [29]:
def get_number_of_words(feature_df, data):
    """Add a ``number_of_words`` column to ``feature_df``.

    The column holds the number of tokens ``word_tokenize`` returns for each
    value of ``data['text']`` (converted with ``str``). ``feature_df`` is
    modified in place and returned.
    """
    def _count_tokens(value):
        tokens = word_tokenize(str(value))
        return len(tokens)

    token_counts = data['text'].apply(_count_tokens)
    feature_df['number_of_words'] = token_counts
    return feature_df
# Attach the token count column and preview its first rows.
feature_df = get_number_of_words(feature_df, data)
feature_df['number_of_words'].head()

0    5
1    4
2    9
3    5
4    3
Name: number_of_words, dtype: int64

In [30]:
def get_number_of_whitespaces(feature_df, data):
    """Add a ``number_of_white_spaces`` column to ``feature_df``.

    The column holds, for each value of ``data['text']`` (converted with
    ``str``), the number of characters for which ``str.isspace`` is true.
    ``feature_df`` is modified in place and returned.
    """
    def _count_whitespace(value):
        return sum(1 for char in str(value) if char.isspace())

    whitespace_counts = data['text'].apply(_count_whitespace)
    feature_df['number_of_white_spaces'] = whitespace_counts
    return feature_df
# Attach the whitespace-character count column and preview its first rows.
feature_df = get_number_of_whitespaces(feature_df, data)
feature_df['number_of_white_spaces'].head()

0    4
1    3
2    7
3    3
4    2
Name: number_of_white_spaces, dtype: int64

In [31]:
# Preview the feature table, now including the count columns added by the cells above.
feature_df.head()

Unnamed: 0,LS,TO,VBN,'',WP,UH,VBG,JJ,VBZ,--,...,CC,CD,POS,num_of_unique_punctuations,number_of_capital_words,number_of_small_words,number_of_alphabets,number_of_digits,number_of_words,number_of_white_spaces
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,1,4,19,0,5,4
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,1,3,18,0,4,3
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1,1,7,28,0,9,7
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1,1,3,14,0,5,3
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0,1,2,13,0,3,2
