In [1]:
%pylab inline
%precision 6

Populating the interactive namespace from numpy and matplotlib


'%.6f'

In [2]:
%load_ext autoreload
%autoreload 1
%aimport common

In [3]:
import pandas as pd
import sklearn as skl
import sklearn

In [4]:
# download common from https://github.com/Apogentus/common and add destination to it to PythonPath system variable
from common.serialization import pickle_load, pickle_save  
from common.classes.Struct import Struct
from common.feature_transformations import get_one_hot_encoding
from common.functions import all_nums
from common.visualize.colors import COLORS
from common.visualize.distributions import *

In [5]:
import scipy

In [6]:
pd.options.display.max_colwidth=100
np.set_printoptions(linewidth=140,edgeitems=10)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
rcParams['figure.figsize'] = (8.0, 5.0)

In [7]:
categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']

In [8]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train',
    categories=categories, shuffle=True, random_state=42)

In [9]:
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [10]:
len(twenty_train.data), len(twenty_train.filenames)

(2257, 2257)

In [11]:
# clear headers until 'Lines: xxx' to get pure text contents
import copy, re
texts = copy.deepcopy(twenty_train.data)

for doc_num in range(len(texts)):
    m=re.search('Lines: \d+\s+', texts[doc_num])
    if m:  # if match found 
        i = m.span(0)[1]
        texts[doc_num] = texts[doc_num][i:]

In [12]:
del twenty_train

#### Word counts

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words='english',min_df=5)  # wrod should appear at least 5 times in the training set
X_train_counts = count_vect.fit_transform(texts)
X_train_counts.shape

(2257, 7859)

#### Word TF-IDFs

In [28]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X = tf_transformer.transform(X_train_counts)
X.shape

(2257, 7859)

In [29]:
words2nums = count_vect.vocabulary_ 

In [30]:
nums2words = dict((num,word) for word,num in words2nums.items())

In [31]:
words2nums, nums2words

({'does': 2331,
  'know': 4033,
  'good': 3206,
  'way': 7671,
  'standard': 6725,
  'pc': 5207,
  'application': 672,
  'pd': 5210,
  'utility': 7460,
  'convert': 1773,
  'tga': 7082,
  'files': 2887,
  'iii': 3547,
  'format': 2998,
  'like': 4186,
  'converting': 1777,
  'hp': 3492,
  'email': 2512,
  'response': 6051,
  'correct': 1814,
  'group': 3269,
  'thanks': 7085,
  'advance': 451,
  'michael': 4556,
  'programmer': 5581,
  'computer': 1618,
  'unit': 7386,
  'uk': 7340,
  'ac': 346,
  'city': 1424,
  'university': 7395,
  'tel': 7032,
  '8000': 271,
  'london': 4265,
  'fax': 2835,
  'hi': 3417,
  'problem': 5548,
  'hope': 3469,
  'help': 3399,
  'solve': 6604,
  'background': 876,
  'rectangular': 5856,
  'mesh': 4530,
  'domain': 2342,
  'mapping': 4383,
  '3d': 176,
  'bezier': 989,
  'patch': 5185,
  '2d': 146,
  'area': 704,
  'inside': 3722,
  'loop': 4274,
  'rendered': 5966,
  'set': 6412,
  'curve': 1938,
  'segments': 6358,
  'sake': 6225,
  'cells': 1307,
  'sp

In [32]:
U,S,VT = scipy.sparse.linalg.svds(X, k=50)

In [33]:
X.shape # [documents x words]

(2257, 7859)

In [34]:
U.shape

(2257, 50)

In [35]:
S.shape

(50,)

In [36]:
VT.shape

(50, 7859)

### Extracting most close words

In [37]:
word_num = words2nums['doctor'] # try also: christian doctor treatment printer anxiety

dists = sum( (VT-VT[:, word_num][:,newaxis])**2, 0)

dists

inds = argsort(dists)

dists[inds]

nearest_inds = inds[1:11]

[nums2words[ind] for ind in nearest_inds]

['pain',
 'treatment',
 'patient',
 'patients',
 'told',
 'taking',
 'clinic',
 'doctors',
 'medicine',
 'days']

### Extracting most close documents

In [41]:
doc_num = 0

dists = sum( (U-U[doc_num,:][newaxis,:])**2, 1)
inds = argsort(dists)
nearest_inds = inds[1:6]

In [42]:
print('ORIGINAL DOCUMENT:\n\n%s\n\n\n'%texts[doc_num][:600])

ORIGINAL DOCUMENT:

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.






In [43]:
for num, ind in enumerate(nearest_inds,start=1):
    print('%s-TH MOST CLOSE DOCUMENT:\n\n%s\n\n\n'%(num, texts[ind][:600]) )

1-TH MOST CLOSE DOCUMENT:

Does anybody know where I can get, via anonymous ftp or otherwise, a PostScript
driver for the graphics libraries GINO verison 3.0A ?

We are runnining on a VAX/VMS and are looking for a way outputing our plots to a
PostScript file...


Thanks in advance...
-- 
Koon Tang,                                internet: ktt3@unix.bton.ac.uk
Department of Mathematical Sciences,          uucp: uknet!itri!ktt3
University of Brighton,
Brighton,
BN2 4GJ,
U.K.




2-TH MOST CLOSE DOCUMENT:

Are any readers of s.r.c. going to the Love Europe congress in Germany this
July?
-- 
Michael Davis (cs89mcd@brunel.ac.uk)




3-TH MOST CLOSE DOCUMENT:

From: stjohn@math1.kaist.ac.kr (Ryou Seong Joon)
Subject: WANTED: Multi-page GIF!!
Organization: Korea Advanced Institute of Science and Technology
X-Newsreader: Tin 1.1 PL3
Lines:       12

Hi!... 

I am searching for packages that could handle Multi-page GIF
files...    

Are there any on some ftp servers?

I'll appreciate one which 