In [495]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from tabulate import tabulate

In [332]:
# Load the DataFrame stored in ufo_db.pkl.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# come from a trusted source.
df = pd.read_pickle('ufo_db.pkl')

In [329]:
def split_entered(word):
    """Return the part of ``word`` that precedes the '(Entered as : ' note.

    Only the first occurrence of the marker is considered. When the marker
    is absent, ``word`` is returned unchanged.
    """
    leading_text, _marker, _rest = word.partition('(Entered as : ')
    return leading_text
   
def split_occurred(word):
    """Return the text that follows the 'Occurred :' label in ``word``.

    Parameters
    ----------
    word : str
        Raw field text, expected to contain the 'Occurred :' label.

    Returns
    -------
    str
        The text after the label (up to a second label, if any). When the
        label is missing, ``word`` is returned unchanged.
    """
    try:
        return word.split('Occurred :')[1]
    except IndexError:
        # Label missing (for example the value was already cleaned): fall
        # back to the input, as split_shape/split_duration/split_posted do,
        # rather than aborting the whole column apply.
        return word

def split_reported(word):
    """Return the text that follows the 'Reported: ' label in ``word``.

    Parameters
    ----------
    word : str
        Raw field text, expected to contain the 'Reported: ' label.

    Returns
    -------
    str
        The text after the label (up to a second label, if any). When the
        label is missing, ``word`` is returned unchanged.
    """
    parts = word.split("Reported: ")
    # A single part means the label was not found; return the input as-is,
    # matching the fallback used by split_shape/split_duration/split_posted.
    return parts[1] if len(parts) > 1 else word

def split_location(word):
    """Return the text that follows the 'Location:' label in ``word``.

    Parameters
    ----------
    word : str
        Raw field text, expected to contain the 'Location:' label.

    Returns
    -------
    str
        The text after the label (up to a second label, if any). When the
        label is missing, ``word`` is returned unchanged.
    """
    parts = word.split('Location:')
    if len(parts) < 2:
        # Label not found: keep the value untouched, consistent with
        # split_shape/split_duration/split_posted.
        return word
    return parts[1]

def split_shape(word):
    """Return the text after the 'Shape:' label, or ``word`` if it has no label."""
    pieces = word.split('Shape:')
    if len(pieces) < 2:
        # No label present: hand the value back unchanged.
        return word
    return pieces[1]

def split_duration(word):
    """Return the text after the 'Duration:' label, or ``word`` if it has no label."""
    segments = word.split('Duration:')
    # More than one segment means the label was found.
    return segments[1] if len(segments) > 1 else word
    
def split_posted(word):
    """Return the text after the 'Posted:' label, or ``word`` if it has no label."""
    chunks = word.split('Posted:')
    if len(chunks) >= 2:
        return chunks[1]
    # Label absent: leave the value untouched.
    return word

In [333]:
# Strip the field labels from columns 0-5, one (column, cleaner) pair at a
# time. Column 0 is cleaned in two passes: first the 'Occurred :' label, then
# the trailing '(Entered as : ...' note.
# NOTE: this overwrites the columns of df in place, so re-running the cell
# without reloading df applies the cleaners to already-cleaned values.
column_cleaners = [
    (0, split_occurred),
    (0, split_entered),
    (1, split_reported),
    (2, split_posted),
    (3, split_location),
    (4, split_shape),
    (5, split_duration),
]
for column, cleaner in column_cleaners:
    df[column] = df[column].apply(cleaner)

In [334]:
# Preview the first rows of df to inspect the cleaned columns.
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,5/6/2017 05:00,5/6/2017 4:10:01 AM 04:10,5/6/2017,"Camp McGregor, NM",Light,10 minutes,Light seen over mountain's east of Camp McGreg...
1,5/6/2017 04:50,5/6/2017 5:00:54 AM 05:00,5/6/2017,"Mojave (Canada), BC",Light,1:00,Light in sky stationary. Not a airplane or an...
2,5/5/2017 11:30,5/5/2017 12:18:44 PM 12:18,5/6/2017,"Austin, TX",Disk,3 seconds,"Flying saucer descends, possibly lands in Nort..."
3,5/5/2017 03:00,5/5/2017 3:49:05 AM 03:49,5/6/2017,"El Mirage, AZ",Circle,30 seconds,"While letting my dog out, a very bright white ..."
4,5/4/2017 23:34,5/4/2017 10:38:52 PM 22:38,5/6/2017,"York, NE",Fireball,0:18,A fire ball was moving in the atmosphere while...


In [414]:
# Column 6 holds the free-text descriptions that are vectorized below.
descriptions = df[6]

In [415]:
# TF-IDF vectorizer that drops English stop words and caps the vocabulary
# at 5000 features.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

In [416]:
# Record the Python type of every description so that non-string entries can
# be located.
types = np.array(list(map(type, descriptions)))


In [417]:
# Copy of the whole frame as a NumPy array.
# NOTE(review): df_array is not referenced in the visible cells -- confirm
# whether it is still needed.
df_array = np.array(df)

In [419]:
# Boolean mask: True where the description is a float instead of text
# (presumably NaN for a missing description -- confirm).
remove_floats = types == float

In [425]:
# Select the float-typed descriptions with the boolean mask.
rows_to_remove = descriptions[remove_floats]

In [431]:
# Show the index labels of the rows that will be excluded.
rows_to_remove.index.values

array([13698, 39641])

In [432]:
# Preview the descriptions without the float rows. The result is only
# displayed, not assigned, so `descriptions` itself is unchanged.
descriptions.drop(rows_to_remove.index.values)

0         Light seen over mountain's east of Camp McGreg...
1         Light in sky stationary.  Not a airplane or an...
2         Flying saucer descends, possibly lands in Nort...
3         While letting my dog out, a very bright white ...
4         A fire ball was moving in the atmosphere while...
5         craft following above car I saw a light reflec...
6         helicopter chasing orange light at low altitud...
7         Orange round sphere. Orange glowing sphere fle...
8         Was standing outside in downtown houston at 8:...
9         Flying corkscrews Looking to th east at about ...
10        Hat-shaped tan object caught on cell phone cam...
11        I'm a truck driver and I've seen the reddish/o...
12        For a little over a month, I would go outside ...
13        Watched a 3-4 in sphere come in one end of dev...
14        Fast moving green lights.  One flashing the ot...
15        I was walking my dog and I looked up at the st...
16        Three moving bright lights. Th

In [435]:
# Fit the vectorizer on the descriptions that remain after dropping the
# float-typed rows, producing the TF-IDF matrix.
text_descriptions = descriptions.drop(rows_to_remove.index.values)
tfidf = vectorizer.fit_transform(text_descriptions)

In [487]:
# NMF model with 10 components; random initialisation with a fixed
# random_state so repeated runs start from the same state.
sklearn_nmf = NMF(n_components=10, init='random', random_state=0)

In [488]:
# Fit the NMF model on the TF-IDF matrix.
sklearn_nmf.fit(tfidf)

NMF(alpha=0.0, beta_loss='frobenius', init='random', l1_ratio=0.0,
  max_iter=200, n_components=10, random_state=0, shuffle=False,
  solver='cd', tol=0.0001, verbose=0)

In [452]:
# Vocabulary terms as an array, so they can be indexed with arrays of feature
# positions. get_feature_names() was removed from newer scikit-learn releases
# in favour of get_feature_names_out(); use the new method when it exists and
# fall back to the old one on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    names = np.asarray(vectorizer.get_feature_names_out())
else:
    names = np.array(vectorizer.get_feature_names())

In [499]:
# For every NMF component keep the five highest-weighted terms. np.argsort
# sorts ascending, so within each topic the words run from weakest to
# strongest of the five (the top-weighted term comes last).
topic_list = [
    names[np.argsort(weights)[-5:]]
    for weights in sklearn_nmf.components_
]
# Row labels 'Topic 1', 'Topic 2', ... -- one per component.
rows = ['Topic {}'.format(number) for number in range(1, len(topic_list) + 1)]

In [500]:
# One row per topic, one column per selected word.
word_columns = ['Word 1', 'Word 2', 'Word 3', 'Word 4', 'Word 5']
topic_df = pd.DataFrame(data=topic_list, index=rows, columns=word_columns)


In [501]:
# Render the topic table as a Markdown pipe table and print it.
# NOTE(review): .round() looks like a no-op here because topic_df holds
# words, not numbers -- confirm before removing it.
topic_table = tabulate(topic_df.round(), headers='keys', tablefmt='pipe')
print(topic_table)

|          | Word 1      | Word 2      | Word 3    | Word 4   | Word 5   |
|:---------|:------------|:------------|:----------|:---------|:---------|
| Topic 1  | went        | looked      | like      | just     | saw      |
| Topic 2  | approximate | pd          | note      | nuforc   | date     |
| Topic 3  | orbs        | disappeared | fireball  | glowing  | orange   |
| Topic 4  | remain      | information | anonymous | elects   | provides |
| Topic 5  | formation   | flashing    | triangle  | red      | lights   |
| Topic 6  | south       | west        | north     | east     | objects  |
| Topic 7  | green       | red         | white     | bright   | light    |
| Topic 8  | white       | shape       | appeared  | shaped   | object   |
| Topic 9  | triangular  | flying      | triangle  | shaped   | craft    |
| Topic 10 | stars       | like        | moving    | sky      | star     |
