In [1]:
import os
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from textblob import TextBlob

In [2]:
docs = 'tech_trends'

In [3]:
os.chdir(docs)
os.listdir()[:5]

['PS_Trends2014_CIOasVC.txt',
 'PS_Trends2014_CloudOrchestration.txt',
 'PS_Trends2014_CognitiveAnalytics.txt',
 'PS_Trends2014_Crowdsourcing.txt',
 'PS_Trends2014_DigitalEngage.txt']

In [4]:
doc_dict = {}
for doc_name in os.listdir():
    with open(doc_name, 'rb') as doc:
        doc_dict[doc_name] = [line.strip().decode('utf8') for line in doc.readlines()]

In [5]:
doc_dict = {doc_name: " ".join(doc_dict[doc_name]) for doc_name in doc_dict.keys()}

In [6]:
doc_df = pd.DataFrame.from_dict(doc_dict,orient='index').reset_index()
doc_df.columns = ['doc_name', 'text']
doc_df.head()

Unnamed: 0,doc_name,text
0,PS_Trends2014_CIOasVC.txt,﻿CIOs have historically focused on core delive...
1,PS_Trends2014_CloudOrchestration.txt,﻿ CLOUD adoption across the enterprise is a gr...
2,PS_Trends2014_CognitiveAnalytics.txt,"﻿FOR decades, companies have dealt with inform..."
3,PS_Trends2014_Crowdsourcing.txt,﻿ ENTERPRISE adoption of crowdsourcing can all...
4,PS_Trends2014_DigitalEngage.txt,﻿DIGITAL is at the heart of business— reshapin...


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(doc_df['text'])

In [8]:
print(vectorizer.get_feature_names()[-100:])
X.shape

['world', 'world_s_', 'worlds', 'worldwide', 'worn', 'worried', 'worry', 'worse', 'worsen', 'worst', 'worth', 'wortham', 'worthwhile', 'would', 'wouldn', 'woven', 'wow', 'wowing', 'wp', 'wpd', 'wrap', 'wrapping', 'wrestling', 'wright', 'write', 'writer', 'writers', 'writes', 'writing', 'written', 'wrong', 'wrote', 'wsdl', 'wsj', 'wtf', 'wulf', 'wurster', 'wuxi', 'www', 'www2', 'wyatt', 'wycash', 'wylie', 'xaas', 'xbox', 'xlviii', 'xml', 'xoom', 'xowi', 'xprize', 'yahoo', 'yang', 'ye', 'yeah', 'year', 'year6', 'yearlong', 'years', 'yes', 'yesterday', 'yesteryear', 'yet', 'yield', 'yielded', 'yielding', 'yields', 'yili', 'yin', 'york', 'yoshikawa', 'you', 'young', 'younger', 'your', 'yourgenome', 'yourself', 'yourselves', 'youtube', 'yudkowsky', 'yun', 'yusuf', 'zachary', 'zb', 'zdnet', 'zealand', 'zero', 'zeroturnaround', 'zettabyte', 'zettabytes', 'zgorski', 'zheng', 'zhtml', 'zoey', 'zone', 'zoned', 'zones', 'zumwalt', 'zurich', 'åse', 'être']


(34, 13279)

In [9]:
vec_df = pd.DataFrame(X.toarray(), columns = vectorizer.get_feature_names(), index=doc_df['doc_name'])
vec_df.head()

Unnamed: 0_level_0,000,01,02,03,030998,031,04,05,05b5efa4,06,...,zheng,zhtml,zoey,zone,zoned,zones,zumwalt,zurich,åse,être
doc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PS_Trends2014_CIOasVC.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PS_Trends2014_CloudOrchestration.txt,0.0,0.003751,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PS_Trends2014_CognitiveAnalytics.txt,0.0,0.003701,0.0,0.01735,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PS_Trends2014_Crowdsourcing.txt,0.036831,0.0,0.008689,0.0,0.0,0.006072,0.003707,0.007414,0.0,0.003892,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PS_Trends2014_DigitalEngage.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.00444,0.00444,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(X)

array([[1.        , 0.82666369, 0.82708548, ..., 0.81478691, 0.7976713 ,
        0.86835999],
       [0.82666369, 1.        , 0.81724957, ..., 0.78294481, 0.76110811,
        0.83141918],
       [0.82708548, 0.81724957, 1.        , ..., 0.8152465 , 0.79393042,
        0.83090087],
       ...,
       [0.81478691, 0.78294481, 0.8152465 , ..., 1.        , 0.89864413,
        0.9225698 ],
       [0.7976713 , 0.76110811, 0.79393042, ..., 0.89864413, 1.        ,
        0.90064656],
       [0.86835999, 0.83141918, 0.83090087, ..., 0.9225698 , 0.90064656,
        1.        ]])

In [11]:
search_phrase = input('Enter search phrase: ')
search_row = pd.DataFrame(vectorizer.transform([search_phrase]).toarray(), 
                                         columns=vectorizer.get_feature_names(), 
                                         index=['search_phrase'])
temp_df = pd.concat([search_row, vec_df])
cosine_similarity(temp_df)[0,:]
cos_df = pd.DataFrame({'doc_name': list(temp_df.index), 'cos_sim': cosine_similarity(temp_df)[0,:]})\
                        .sort_values(by='cos_sim', ascending=False)

cos_df[cos_df['doc_name'] != 'search_phrase'].head(10)

Enter search phrase: will artificial intelligence replace workers


Unnamed: 0,doc_name,cos_sim
33,Trends2018_No_Collar_Workforce.txt,0.090929
26,Trends2017_Machine_Intelligence.txt,0.076576
31,Trends2018_Exponentials.txt,0.05795
27,Trends2017_Mixed_Reality.txt,0.051796
6,PS_Trends2014_Exponentials.txt,0.043333
11,PS_Trends2014_Wearables.txt,0.02443
12,Trends2016_Augmented_Virtual_Reality.txt,0.024248
15,Trends2016_Exponentials.txt,0.023212
21,Trends2017_Dark_Analytics.txt,0.022474
23,Trends2017_Exponentials.txt,0.020491


# Top Doc

In [12]:
top_doc = list(cos_df[cos_df['doc_name'] != 'search_phrase']['doc_name'])[0]
doc_dict[top_doc]

'\ufeffTech Trends 2018 The symphonic enterprise COVER IMAGE BY: MARTIN SATI Deloitte Consulting LLP’s Technology Consulting practice is dedicated to helping our clients build tomorrow by solving today’s complex business problems involving strategy, procurement, design, delivery, and assurance of technology solutions. Our service areas include analytics and information management, delivery, cyber risk services, and technical strategy and architecture, as well as the spectrum of digital strategy, design, and development services offered by Deloitte Digital. Learn more about our Technology Consulting practice on www.deloitte.com. Tech Trends 2018: The symphonic enterprise  No-collar workforce Humans and machines in one loop— collaborating in roles and new talent models WITH intelligent automation marching steadily toward broader adoption, media coverage of this historic technology disruption is turning increasingly alarmist. “New study: Artificial intelligence is coming for your jobs, mi