# Examining Tokens

In [1]:
from data import data_frames as dfs
import pandas as pd

In [2]:
dfs.all_df_names

['ashville_lst',
 'ashville_rvs',
 'austin_lst',
 'austin_rvs',
 'broward_lst',
 'broward_rvs',
 'clark_lst',
 'clark_rvs',
 'nashville_lst',
 'nashville_rvs',
 'ottawa_lst',
 'ottawa_rvs',
 'salem_lst',
 'salem_rvs']

In [3]:
dfs.all_dfs[1].shape

(173892, 6)

In [4]:
# Record [row count, column count] for every loaded frame, keyed by its name.
shape_by_name = {}
for position, name in enumerate(dfs.all_df_names):
    n_rows, n_columns = dfs.all_dfs[position].shape
    shape_by_name[name] = [n_rows, n_columns]

# Names become the index after the transpose; positions 0/1 get readable labels.
df_stats = (
    pd.DataFrame(shape_by_name)
    .T
    .rename(columns={0: 'n_rows', 1: 'n_columns'})
)
print(df_stats.shape)

(14, 2)


In [5]:
df_stats.sort_values('n_rows', ascending=False)

Unnamed: 0,n_rows,n_columns
nashville_rvs,322322,6
austin_rvs,321484,6
clark_rvs,267537,6
broward_rvs,215455,6
ashville_rvs,173892,6
ottawa_rvs,100764,6
austin_lst,10450,74
broward_lst,10071,74
clark_lst,9156,74
salem_rvs,7994,6


In [6]:
# Keep only the names of the reviews dataframes (those containing 'rvs').
rvs_names = list(filter(lambda name: 'rvs' in name, dfs.all_df_names))
rvs_names

['ashville_rvs',
 'austin_rvs',
 'broward_rvs',
 'clark_rvs',
 'nashville_rvs',
 'ottawa_rvs',
 'salem_rvs']

In [7]:
dir(dfs)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'all_df_names',
 'all_dfs',
 'ashville_lst',
 'ashville_rvs',
 'austin_lst',
 'austin_rvs',
 'broward_lst',
 'broward_rvs',
 'clark_lst',
 'clark_rvs',
 'nashville_lst',
 'nashville_rvs',
 'ottawa_lst',
 'ottawa_rvs',
 'pd',
 'rvs_dfs',
 'rvs_names',
 'salem_lst',
 'salem_rvs']

In [8]:
# Row count of each reviews frame, keyed by the frame's name.
rows_by_name = {
    name: dfs.rvs_dfs[position].shape[0]
    for position, name in enumerate(dfs.rvs_names)
}
# One-column table: index = frame name, 'n_rows' = number of reviews.
rvs_dfs_stats = pd.Series(rows_by_name).to_frame(name='n_rows')
print(rvs_dfs_stats.shape)

(7, 1)


In [9]:
rvs_dfs_stats

Unnamed: 0,n_rows
ashville_rvs,173892
austin_rvs,321484
broward_rvs,215455
clark_rvs,267537
nashville_rvs,322322
ottawa_rvs,100764
salem_rvs,7994


In [10]:
dfs.rvs_dfs[0]['comments']

0         Lisa is superb hostess, she will treat you lik...
1         This was a lovely little place walking distanc...
2         Lisa was very nice to work with.  However, we ...
3         I feel very lucky to have found this beautiful...
4         Great roomy little apartment, beautiful privat...
                                ...                        
173887    Lisa was a great host & the location is very p...
173888    Lisa's place was great value for the area! It ...
173889    We were comfortable in this home and it is con...
173890    Great location, close to everything. Spacious ...
173891    We had a great time and we wish we could have ...
Name: comments, Length: 173892, dtype: object

In [11]:
dfs.rvs_dfs[0]['comments'][0]

'Lisa is superb hostess, she will treat you like family and provide you with the coziest little home in Asheville which will definitely enhance your experience of the magical town! Just like the Eco-retreat, the Private sunny apartment is a neat little flat with all you need for up to 3 people, the place was impeccable in lovely neighborhood. You can hardly beat this one!'

## Start Tokenization

In [12]:
import spacy

# Load the spaCy English pipeline used for tokenization below.
# "lg" is the large package; the "sm" / "md" packages are the smaller alternatives.
nlp = spacy.load("en_core_web_lg")

In [13]:
doc = nlp(dfs.rvs_dfs[0]['comments'][0])

In [19]:
# Materialise the parsed document's tokens into a list and peek at the first five.
tokens = list(doc)
tokens[:5]

[We, arrived, to, a, beautiful]

In [26]:
# Comments of the first reviews frame with the missing (NaN) entries removed,
# so that only actual text is handed to the tokenizer.
try_with_drops = dfs.rvs_dfs[0]['comments'].dropna()

In [None]:
# Tokenize every non-missing comment of the first reviews frame.
# NOTE(review): the earlier failure was presumably caused by missing comments,
# which pandas stores as np.nan (a float) while nlp.pipe expects text -- hence
# the NaN-free `try_with_drops` series is used here. Confirm on a fresh run.
#
# Result: `tokens` holds exactly one list of lower-cased tokens per comment.
tokens = []
for doc in nlp.pipe(try_with_drops):
    doc_tokens = []
    for token in doc:
        # Keep only content tokens: no stop words, punctuation or whitespace.
        if not (token.is_stop or token.is_punct or token.is_space):
            doc_tokens.append(token.text.lower())
    # Append once per document, after all of its tokens were collected
    # (appending inside the token loop added the same list once per token).
    tokens.append(doc_tokens)

In [25]:
# Operation Find Float: count the missing comments in the first reviews frame.
# A missing value is represented as NaN, which is a float rather than a string.

dfs.rvs_dfs[0]['comments'].isna().sum()

54

In [29]:
# Tokenize the comments of every reviews frame.
# Result: master_tokens[i][j] is the list of lower-cased content tokens of the
# j-th non-missing comment in the i-th reviews frame.
master_tokens = []

for df in dfs.rvs_dfs:
    tokens = []
    # Missing comments (NaN) are dropped so only text reaches the pipeline.
    for doc in nlp.pipe(df['comments'].dropna()):
        doc_tokens = []
        for token in doc:
            # Keep only content tokens: no stop words, punctuation or whitespace.
            if not (token.is_stop or token.is_punct or token.is_space):
                doc_tokens.append(token.text.lower())
        # Append once per document, after all of its tokens were collected
        # (appending inside the token loop added the same list once per token).
        tokens.append(doc_tokens)
    master_tokens.append(tokens)



In [30]:
# Sanity check: one entry per reviews frame, then show the first token list
# of the first frame.
print("Length", len(master_tokens))
master_tokens[0][0]

Length 7


['lisa',
 'superb',
 'hostess',
 'treat',
 'like',
 'family',
 'provide',
 'coziest',
 'little',
 'home',
 'asheville',
 'definitely',
 'enhance',
 'experience',
 'magical',
 'town',
 'like',
 'eco',
 'retreat',
 'private',
 'sunny',
 'apartment',
 'neat',
 'little',
 'flat',
 'need',
 '3',
 'people',
 'place',
 'impeccable',
 'lovely',
 'neighborhood',
 'hardly',
 'beat']

In [None]:
# Is tokenization the best normalization strategy?
# Is normalization the right word?