In [1]:
import numpy as np
import pandas as pd
import RAKE
import matplotlib.pyplot as plt
import seaborn as sns
from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline

In [2]:
data = pd.read_excel('data.xlsx',index=False)

In [3]:
data.head(2)

Unnamed: 0,company_name,url,category
0,Advent International Corp.,www.adventinternational.com,VC
1,Crescent Real Estate Holdings LLC,www.crescent.com,VC


In [4]:
from sklearn.utils import shuffle
data = shuffle(data,random_state=0)

In [5]:
data.head(2)

Unnamed: 0,company_name,url,category
18,The Argentum Group (New York),www.argentumgroup.com,VC
169,"IT Group, Inc.",www.theitgroup.com,Non VC


In [6]:
new = data['url'].str.split(".",n=2,expand = True)
data['url_name'] = new[1]

In [7]:
data.head(2)

Unnamed: 0,company_name,url,category,url_name
18,The Argentum Group (New York),www.argentumgroup.com,VC,argentumgroup
169,"IT Group, Inc.",www.theitgroup.com,Non VC,theitgroup


In [8]:
data = data.drop(['url','url_name'],axis=1)

In [9]:
data.head()

Unnamed: 0,company_name,category
18,The Argentum Group (New York),VC
169,"IT Group, Inc.",Non VC
106,"Alpine Group, Inc.",Non VC
92,Pacific Private Capital LLC,VC
176,"Kettle Restaurants, Inc.",Non VC


In [10]:
data_set = data

In [11]:
import re
def pre_process(text):
    
    # lowercase
    text=text.lower()
    
    #remove tags
    text=re.sub("</?.*?>"," <> ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    return text

In [12]:
data['company_name'] =data['company_name'].apply(lambda x:pre_process(x))
docs_title = data['company_name'].tolist()
docs_category = data['category'].tolist()

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
import re

def get_stop_words(stop_file_path):
    """load stop words """
    
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return frozenset(stop_set)

#load a set of stop words
stopwords=get_stop_words("resource/stopwords.txt")

#create a vocabulary of words, 
#ignore words that appear in 85% of documents, 
#eliminate stop words
cv=CountVectorizer(max_df=0.85,stop_words=stopwords)
word_count_vector=cv.fit_transform(docs_title)

  'stop_words.' % sorted(inconsistent))


In [14]:
word_count_vector.shape

(199, 338)

In [15]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    for idx, score in sorted_items:
        fname = feature_names[idx]
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

In [16]:
# put the common code into several methods
feature_names=cv.get_feature_names()

from sklearn.feature_extraction.text import TfidfTransformer
def get_keywords(idx):

    #generate tf-idf for the given document
    #vectorizer.fit(data_to_vectorize)
    tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
    tfidf_transformer.fit(word_count_vector)
    tf_idf_vector=tfidf_transformer.transform(cv.transform([docs_title[idx]]))

    #sort the tf-idf vectors by descending order of scores
    sorted_items=sort_coo(tf_idf_vector.tocoo())

    #extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)
    
    return keywords

def print_results(idx,keywords):
    # now print the results
    print("\n=====Title=====")
    print(docs_title[idx])
    print("\n===Keywords===")
    for k in keywords:
        print(k,keywords[k])

In [17]:
idx=120
keywords=get_keywords(idx)
print_results(idx,keywords)


=====Title=====
alcatel lucent holdings inc 

===Keywords===
lucent 0.601
alcatel 0.601
holdings 0.527


# Generate keywords for a batch of documents

In [18]:
#generate tf-idf for all documents in your list. docs_test has 500 documents
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)
tf_idf_vector=tfidf_transformer.transform(cv.transform(docs_title))

results=[]
for i in range(tf_idf_vector.shape[0]):
    
    # get vector for a single document
    curr_vector=tf_idf_vector[i]
    
    #sort the tf-idf vector by descending order of scores
    sorted_items=sort_coo(curr_vector.tocoo())

    #extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)
    
    results.append(keywords)

In [19]:
df=pd.DataFrame( list(zip(docs_category,docs_title,results)),columns=['category','doc','keywords'])
df.head()

Unnamed: 0,category,doc,keywords
0,VC,the argentum group new york,"{'york': 0.55, 'argentum': 0.55, 'new': 0.51, ..."
1,Non VC,it group inc,{'group': 1.0}
2,Non VC,alpine group inc,"{'alpine': 0.832, 'group': 0.554}"
3,VC,pacific private capital llc,"{'private': 0.629, 'pacific': 0.629, 'capital'..."
4,Non VC,kettle restaurants inc,"{'restaurants': 0.707, 'kettle': 0.707}"


# Implementing the same using RAKE

In [20]:
data_set.head()

Unnamed: 0,company_name,category
18,the argentum group new york,VC
169,it group inc,Non VC
106,alpine group inc,Non VC
92,pacific private capital llc,VC
176,kettle restaurants inc,Non VC


In [21]:
# initializing the new column
data_set['Key_words'] = ""

for index, row in data_set.iterrows():
    plot = row['company_name']
    
    # instantiating Rake, by default is uses english stopwords from NLTK
    # and discard all puntuation characters
    r = Rake()

    # extracting the words by passing the text
    r.extract_keywords_from_text(plot)

    # getting the dictionary whith key words and their scores
    key_words_dict_scores = r.get_word_degrees()
    
    # assigning the key words to the new column
    row['Key_words'] = list(key_words_dict_scores.keys())



In [22]:
data_set.head()

Unnamed: 0,company_name,category,Key_words
18,the argentum group new york,VC,"[argentum, group, new, york]"
169,it group inc,Non VC,"[group, inc]"
106,alpine group inc,Non VC,"[alpine, group, inc]"
92,pacific private capital llc,VC,"[pacific, private, capital, llc]"
176,kettle restaurants inc,Non VC,"[kettle, restaurants, inc]"


In [23]:
data_set.set_index('company_name', inplace = True)
data_set.head()

Unnamed: 0_level_0,category,Key_words
company_name,Unnamed: 1_level_1,Unnamed: 2_level_1
the argentum group new york,VC,"[argentum, group, new, york]"
it group inc,Non VC,"[group, inc]"
alpine group inc,Non VC,"[alpine, group, inc]"
pacific private capital llc,VC,"[pacific, private, capital, llc]"
kettle restaurants inc,Non VC,"[kettle, restaurants, inc]"


# Recommendation of other companies

In [24]:
data_set['bag_of_words'] = ''
columns = data_set.columns
for index, row in data_set.iterrows():
    words = ''
    for col in columns:

        words = words + ' '.join(row[col])+ ' '

    row['bag_of_words'] = words
    

In [25]:
data_set.head()

Unnamed: 0_level_0,category,Key_words,bag_of_words
company_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
the argentum group new york,VC,"[argentum, group, new, york]",V C argentum group new york
it group inc,Non VC,"[group, inc]",N o n V C group inc
alpine group inc,Non VC,"[alpine, group, inc]",N o n V C alpine group inc
pacific private capital llc,VC,"[pacific, private, capital, llc]",V C pacific private capital llc
kettle restaurants inc,Non VC,"[kettle, restaurants, inc]",N o n V C kettle restaurants inc


In [26]:
# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(data_set['bag_of_words'])

# creating a Series for the companies so they are associated to an ordered numerical
# list I will use later to match the indexes
indices = pd.Series(data_set.index)
indices[:5]

0    the argentum group new york 
1                   it group inc 
2               alpine group inc 
3     pacific private capital llc
4         kettle restaurants inc 
Name: company_name, dtype: object

In [27]:
# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1.        , 0.35355339, 0.28867513, ..., 0.        , 0.        ,
        0.        ],
       [0.35355339, 1.        , 0.81649658, ..., 0.40824829, 0.40824829,
        0.        ],
       [0.28867513, 0.81649658, 1.        , ..., 0.33333333, 0.33333333,
        0.        ],
       ...,
       [0.        , 0.40824829, 0.33333333, ..., 1.        , 0.33333333,
        0.        ],
       [0.        , 0.40824829, 0.33333333, ..., 0.33333333, 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [28]:
# function that takes in company_name as input and returns the top 10 recommended companies
def recommendations(company_name, cosine_sim = cosine_sim):
    
    recommended = []
    
    # gettin the index of the company_name that matches the company_name
    idx = indices[indices == company_name].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar companies
    top_10_indexes = list(score_series.iloc[1:11].index)
    
    # populating the list with the companies of the best 10 matching companies
    for i in top_10_indexes:
        recommended.append(list(data_set.index)[i])
        
    return recommended

In [32]:
recommendations('the argentum group new york ')

['it group inc ',
 'charterhouse group inc ',
 'bloomberg industry group',
 'the stephens group llc',
 'the cypress group llc',
 'canopy group inc ',
 'the catalyst group inc ',
 'alpine group inc ',
 'h w kaufman financial group inc ',
 'first medical group inc ']

# We can identify which company is VC or Non VC by running the model generated in "Venture_Capital.ipynb"