## Text Featurization
http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

In [2]:
import pandas as pd
import numpy as np
df = pd.read_pickle('project1/articles-cleaned.pkl')

from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sb

# make sure we display all the columns
pd.set_option('display.max_columns', 25)

# let's split this into two tables: meta table and word table
meta_table = df[['item_id', 'resolved_id', 'is_archived', 'word_count', 
                 'actual_word_count', 'resolved_title', 'resolved_url']]

word_table = df[['item_id', 'article_text']]

# view the results of these transformations
(rows, cols) = meta_table.shape
print "Meta Rows: ", rows
print "Meta Cols: ", cols
print 
meta_table.tail(1)

Meta Rows:  653
Meta Cols:  7



Unnamed: 0,item_id,resolved_id,is_archived,word_count,actual_word_count,resolved_title,resolved_url
864,928712290,928712290,False,3688,3685,Fixing Engineering's Loyalty and Longevity Pro...,http://firstround.com/review/fixing-engineerin...


In [3]:
# view the results of these transformations
(rows, cols) = word_table.shape
print "Word Rows: ", rows
print "Word Cols: ", cols
print 
word_table.tail(1)

Word Rows:  653
Word Cols:  2



Unnamed: 0,item_id,article_text
864,928712290,first round reviewfixing engineerings loyalty ...


In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text

css_keywords = set()
with open('css-keywords.txt', 'rb') as fp:
    for line in fp:
        css_keywords.add(line.strip())

my_stop_words = text.ENGLISH_STOP_WORDS.union(css_keywords)
vectorizers = [
    CountVectorizer(stop_words=my_stop_words, ngram_range=(1,3), analyzer='word', max_features=50, 
                    min_df=.10, max_df=.95, lowercase=True),
    TfidfVectorizer(stop_words=my_stop_words, ngram_range=(1,3), analyzer='word', max_features=50, 
                    min_df=.10, max_df=.95, lowercase=True)
]

data_sets = []
for v in vectorizers:
    X = v.fit_transform(word_table.article_text).todense()
    print v.get_feature_names()
    print
    
    # create new data frames containing trainable data
    data = pd.DataFrame(X, columns=v.get_feature_names())
    data_sets.append(data)


IOError: [Errno 2] No such file or directory: 'css-keywords.txt'

In [28]:
viewable_data_sets = data_sets[:]
for ds in viewable_data_sets:
    ds['item_id'] = word_table.item_id

print "num_samples:  %d, num_features: %d" % viewable_data_sets[0].shape
viewable_data_sets[0].head()

num_samples:  653, num_features: 51


Unnamed: 0,app,article,best,better,build,change,code,company,day,developers,development,different,...,think,use,used,using,want,way,web,work,world,years,youre,item_id
0,0,0,1,1,0,0,0,0,0,0,0,0,...,0,3,0,0,0,0,0,0,6,2,0,909801869
1,54,1,5,8,1,1,8,0,1,1,1,4,...,5,12,3,6,17,17,5,4,2,5,3,1263607514
2,0,3,0,1,0,1,36,1,1,1,2,16,...,2,12,7,14,1,7,14,2,2,5,8,92392
3,0,0,0,0,0,1,0,0,12,0,0,1,...,3,2,1,2,4,2,0,4,1,0,7,14878635
4,5,1,0,0,2,0,3,0,0,0,7,0,...,2,4,0,0,1,1,5,1,4,0,4,1254271769


In [29]:
print "num_samples:  %d, num_features: %d" % viewable_data_sets[1].shape
viewable_data_sets[1].head()

num_samples:  653, num_features: 51


Unnamed: 0,app,article,best,better,build,change,code,company,day,developers,development,different,...,think,use,used,using,want,way,web,work,world,years,youre,item_id
0,0.0,0.0,0.099179,0.10038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2433,0.0,0.0,0.0,0.0,0.0,0.0,0.625853,0.229811,0.0,909801869
1,0.808171,0.015254,0.05779,0.093584,0.0129,0.013068,0.095889,0.0,0.013467,0.015587,0.014109,0.04985,...,0.062537,0.113414,0.036339,0.066361,0.1843,0.178329,0.070266,0.03965,0.024312,0.066954,0.038488,1263607514
2,0.0,0.067065,0.0,0.017144,0.0,0.019152,0.632383,0.020596,0.019737,0.022843,0.041354,0.292234,...,0.036661,0.166214,0.124266,0.226929,0.015888,0.107615,0.288338,0.029055,0.03563,0.098124,0.150418,92392
3,0.0,0.0,0.0,0.0,0.0,0.054521,0.0,0.0,0.674226,0.0,0.0,0.051994,...,0.156545,0.078861,0.050536,0.092287,0.180919,0.087529,0.0,0.165423,0.050715,0.0,0.374674,14878635
4,0.225085,0.045882,0.0,0.0,0.077606,0.0,0.10816,0.0,0.0,0.0,0.297066,0.0,...,0.075243,0.113713,0.0,0.0,0.032609,0.031553,0.211354,0.029816,0.146255,0.0,0.15436,1254271769


In [9]:
from sklearn.cluster import KMeans


# get all columns except for item_id
cols = [col for col in data.columns if col not in ['item_id']]
features = data[cols]

clustering_model = KMeans(n_clusters = 10)
clustering_model.fit(features)

clusters = clustering_model.predict(features)

clusters

array([1, 5, 3, 7, 4, 3, 6, 6, 7, 2, 5, 7, 1, 5, 5, 8, 6, 4, 9, 9, 8, 8, 1,
       8, 8, 0, 5, 9, 6, 6, 6, 7, 8, 7, 4, 0, 7, 3, 3, 0, 6, 3, 5, 4, 2, 0,
       3, 6, 6, 3, 5, 2, 6, 5, 6, 0, 0, 3, 7, 3, 7, 7, 0, 3, 8, 6, 0, 7, 6,
       9, 8, 6, 6, 7, 5, 0, 6, 3, 1, 8, 0, 0, 7, 7, 7, 3, 5, 8, 1, 5, 7, 0,
       5, 0, 0, 5, 3, 8, 3, 5, 5, 7, 8, 7, 5, 8, 9, 8, 0, 5, 3, 4, 0, 6, 6,
       8, 8, 3, 6, 8, 9, 7, 7, 6, 7, 7, 6, 7, 5, 8, 7, 1, 6, 7, 2, 7, 7, 7,
       0, 6, 3, 7, 8, 4, 7, 8, 3, 2, 3, 6, 8, 2, 6, 9, 2, 2, 8, 9, 0, 2, 0,
       0, 1, 6, 6, 7, 8, 1, 2, 8, 8, 6, 9, 7, 2, 3, 6, 3, 6, 7, 0, 3, 6, 9,
       0, 5, 5, 0, 3, 3, 2, 6, 7, 6, 2, 7, 6, 6, 0, 6, 2, 6, 6, 7, 7, 6, 1,
       2, 1, 5, 0, 8, 0, 7, 5, 1, 7, 6, 6, 7, 7, 0, 7, 3, 4, 1, 0, 2, 5, 6,
       2, 3, 6, 3, 0, 9, 6, 4, 5, 1, 8, 5, 8, 7, 8, 7, 9, 7, 3, 5, 6, 8, 5,
       8, 4, 0, 7, 9, 1, 2, 6, 1, 8, 8, 4, 0, 0, 2, 9, 6, 0, 8, 5, 4, 7, 0,
       5, 3, 4, 5, 4, 0, 8, 3, 6, 7, 2, 7, 6, 8, 5, 6, 7, 9, 5, 8, 6, 6, 1,
       8, 7,

In [6]:
# from sklearn.decomposition import PCA
# from sklearn.cluster import KMeans

# # get all columns except for item_id
# cols = [col for col in data.columns if col not in ['item_id']]
# features = data[cols]

# pca = PCA(n_components=2).fit(features)
# data2D = pca.transform(features)

# for k in xrange(2, 10):
#     kmeans = KMeans(n_clusters=5).fit(features)
#     centers2D = pca.transform(kmeans.cluster_centers_)

#     plt.scatter(data2D[:,0], data2D[:,1], c=kmeans.labels_)
#     plt.hold(True)
#     plt.scatter(centers2D[:,0], centers2D[:,1], 
#             marker='x', s=200, linewidths=3, c='r')


In [192]:
# group clusters and item_ids
clustering = kmeans.predict(features)
cluster_table = pd.DataFrame(clustering, columns=['cluster_id'])
cluster_table['item_id'] = data['item_id']

merged = cluster_table.merge(meta_table, on='item_id')

merged.head()


Unnamed: 0,cluster_id,item_id,resolved_id,is_archived,word_count,actual_word_count,resolved_title,resolved_url
0,0,909801869,909801869,True,505,940,Nepal Earthquake,http://time.com/3838716/earthquake-risk-nepal/...
1,2,1263607514,1263607514,False,6046,6318,Bots won't replace apps. Better apps will repl...,http://dangrover.com/blog/2016/04/20/bots-wont...
2,4,92392,92392,True,3669,3685,The Absolute Minimum Every Software Developer ...,http://www.joelonsoftware.com/articles/Unicode...
3,1,14878635,14878635,True,1128,1134,"Maker's Schedule, Manager's Schedule",http://www.paulgraham.com/makersschedule.html
4,2,1254271769,1254271769,False,859,859,Why Javascript Development is Crazy,http://www.planningforaliens.com/blog/2016/04/...


In [193]:
merged.cluster_id.value_counts()

4    188
1    151
2     56
3     48
0     44
Name: cluster_id, dtype: int64

In [195]:
a = list(merged[merged.cluster_id == 3].resolved_url)
for url in a:
    print url

https://www.airpair.com/angularjs/posts/component-based-angularjs-directives
http://qz.com/528840/researchers-want-to-use-google-glass-to-help-autistic-people-see-emotions/
http://blog.david-andrzejewski.com/machine-learning/practical-machine-learning-tricks-from-the-kdd-2011-best-industry-paper/
http://www.breakoutcareers.com?ref=producthunt
https://medium.com/humans-create-software/how-do-you-judge-a-javascript-programmer-by-only-5-questions-f2abdf7dfd4a
http://bitpoetry.io/difference-between-url-uri-and-urn/
https://medium.com/@rchang/learning-how-to-build-a-web-application-c5499bd15c8f
http://thinkfaster.co/2015/02/so-you-want-to-be-a-developer-rockstar
https://github.com/johnpapa/angular-styleguide
http://www.w3.org/wiki/Open_Web_Platform
http://jyotiska.github.io/blog/posts/python_libraries.html
https://medium.com/cs183c-blitzscaling-class-collection/cs183c-session-8-eric-schmidt-56c29b247998#.z9ejyd1m8
http://qz.com/440453/11-things-ultra-productive-people-do-differently/
http:/