In [1]:
# necessary for when working with external scripts
%load_ext autoreload
%autoreload 2

In [2]:
# import library
import pandas as pd
import numpy as np
import nltk
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
import plotly as py
import math
import PAMI
import umap
%matplotlib inline

# prepare dataset
# only these four newsgroup categories are requested from the corpus
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

# NOTE(review): fetch_20newsgroups is already imported earlier in this cell,
# so this second import is redundant (harmless, but could be dropped)
from sklearn.datasets import fetch_20newsgroups

# load the 'train' subset restricted to `categories`; shuffling uses a fixed
# random_state, presumably so the document order is reproducible between runs
twenty_train = fetch_20newsgroups(subset='train', categories=categories, 
                                  shuffle=True, random_state=42)

In [3]:
twenty_train.data[0:2]

['From: sd345@city.ac.uk (Michael Collier)\nSubject: Converting images to HP LaserJet III?\nNntp-Posting-Host: hampton\nOrganization: The City University\nLines: 14\n\nDoes anyone know of a good way (standard PC application/PD utility) to\nconvert tif/img/tga files into LaserJet III format.  We would also like to\ndo the same, converting to HPGL (HP plotter) files.\n\nPlease email any response.\n\nIs this the correct group?\n\nThanks in advance.  Michael.\n-- \nMichael Collier (Programmer)                 The Computer Unit,\nEmail: M.P.Collier@uk.ac.city                The City University,\nTel: 071 477-8000 x3769                      London,\nFax: 071 477-8565                            EC1V 0HB.\n',
 "From: ani@ms.uky.edu (Aniruddha B. Deglurkar)\nSubject: help: Splitting a trimming region along a mesh \nOrganization: University Of Kentucky, Dept. of Math Sciences\nLines: 28\n\n\n\n\tHi,\n\n\tI have a problem, I hope some of the 'gurus' can help me solve.\n\n\tBackground of the probl

In [4]:
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [5]:
len(twenty_train.data)

2257

In [6]:
len(twenty_train.filenames)

2257

In [7]:
# print the first document of the subset as an example of what it contains
# (the raw text already carries its own newlines, so it can be printed as-is;
# splitting on "\n" and re-joining with "\n" would reproduce the same string)
print(twenty_train.data[0])

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.



In [8]:
print(twenty_train.target_names[twenty_train.target[0]])

comp.graphics


In [9]:
twenty_train.target[0]

1

In [10]:
# category of first 10 documents
twenty_train.target[0:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2], dtype=int64)

In [11]:
for t in twenty_train.target[:10]:
    print(twenty_train.target_names[t])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


In [12]:
# exercise 1
# print out the text data for the first three samples in the dataset
for text in twenty_train.data[:3]:
    print(text)

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.

From: ani@ms.uky.edu (Aniruddha B. Deglurkar)
Subject: help: Splitting a trimming region along a mesh 
Organization: University Of Kentucky, Dept. of Math Sciences
Lines: 28



	Hi,

	I have a problem, I hope some of the 'gurus' can help me solve.

	Background of the problem:
	I have a rectangular mesh in the uv

In [14]:
# converting to dataframe
import pandas as pd

# my functions
# import helpers.data_mining_helpers as dmh
# import helpers.text_analysis as ta

def format_rows(docs):
    """Flatten every document in ``docs.data`` into a one-line string.

    Each newline inside a document is replaced by a single space, after
    which leading and trailing tab (and newline) characters are stripped.
    Ordinary spaces at either end are left in place.

    Args:
        docs: object exposing a ``data`` attribute that iterates over
            document strings.

    Returns:
        list: one single-element list ``[text]`` per document, in the same
        order as ``docs.data`` (a shape suitable for building a
        one-column table of records).
    """
    return [[raw.replace("\n", " ").strip('\n\t')] for raw in docs.data]


In [16]:
X = pd.DataFrame.from_records(format_rows(twenty_train), columns=['text'])

In [18]:
len(X)

2257

In [20]:
X[0:2]

Unnamed: 0,text
0,From: sd345@city.ac.uk (Michael Collier) Subje...
1,From: ani@ms.uky.edu (Aniruddha B. Deglurkar) ...


In [24]:
for t in X["text"][:2]:
    print(t)

From: sd345@city.ac.uk (Michael Collier) Subject: Converting images to HP LaserJet III? Nntp-Posting-Host: hampton Organization: The City University Lines: 14  Does anyone know of a good way (standard PC application/PD utility) to convert tif/img/tga files into LaserJet III format.  We would also like to do the same, converting to HPGL (HP plotter) files.  Please email any response.  Is this the correct group?  Thanks in advance.  Michael. --  Michael Collier (Programmer)                 The Computer Unit, Email: M.P.Collier@uk.ac.city                The City University, Tel: 071 477-8000 x3769                      London, Fax: 071 477-8565                            EC1V 0HB. 
From: ani@ms.uky.edu (Aniruddha B. Deglurkar) Subject: help: Splitting a trimming region along a mesh  Organization: University Of Kentucky, Dept. of Math Sciences Lines: 28    	Hi,  	I have a problem, I hope some of the 'gurus' can help me solve.  	Background of the problem: 	I have a rectangular mesh in the uv

In [32]:
# adding columns
X['category'] = twenty_train.target

In [34]:
def format_labels(target, docs):
    """Look up the category name that corresponds to a numeric label.

    Args:
        target: index into ``docs.target_names``.
        docs: object exposing a ``target_names`` sequence.

    Returns:
        The entry of ``docs.target_names`` at position ``target``.
    """
    label_names = docs.target_names
    return label_names[target]

In [36]:
X['category_name'] = X.category.apply(lambda t: format_labels(t, twenty_train))

In [38]:
X[0:10]

Unnamed: 0,text,categoty,category,category_name
0,From: sd345@city.ac.uk (Michael Collier) Subje...,1,1,comp.graphics
1,From: ani@ms.uky.edu (Aniruddha B. Deglurkar) ...,1,1,comp.graphics
2,From: djohnson@cs.ucsd.edu (Darin Johnson) Sub...,3,3,soc.religion.christian
3,From: s0612596@let.rug.nl (M.M. Zwart) Subject...,3,3,soc.religion.christian
4,From: stanly@grok11.columbiasc.ncr.com (stanly...,3,3,soc.religion.christian
5,From: vbv@lor.eeap.cwru.edu (Virgilio (Dean) B...,3,3,soc.religion.christian
6,From: jodfishe@silver.ucs.indiana.edu (joseph ...,3,3,soc.religion.christian
7,From: aldridge@netcom.com (Jacquelin Aldridge)...,2,2,sci.med
8,From: geb@cs.pitt.edu (Gordon Banks) Subject: ...,2,2,sci.med
9,From: libman@hsc.usc.edu (Marlena Libman) Subj...,2,2,sci.med


In [42]:
# familiarizing yourself with the Data
# a simple query
X[:10][["text","category_name"]]

Unnamed: 0,text,category_name
0,From: sd345@city.ac.uk (Michael Collier) Subje...,comp.graphics
1,From: ani@ms.uky.edu (Aniruddha B. Deglurkar) ...,comp.graphics
2,From: djohnson@cs.ucsd.edu (Darin Johnson) Sub...,soc.religion.christian
3,From: s0612596@let.rug.nl (M.M. Zwart) Subject...,soc.religion.christian
4,From: stanly@grok11.columbiasc.ncr.com (stanly...,soc.religion.christian
5,From: vbv@lor.eeap.cwru.edu (Virgilio (Dean) B...,soc.religion.christian
6,From: jodfishe@silver.ucs.indiana.edu (joseph ...,soc.religion.christian
7,From: aldridge@netcom.com (Jacquelin Aldridge)...,sci.med
8,From: geb@cs.pitt.edu (Gordon Banks) Subject: ...,sci.med
9,From: libman@hsc.usc.edu (Marlena Libman) Subj...,sci.med


In [44]:
X[-10:]

Unnamed: 0,text,categoty,category,category_name
2247,From: daniels@math.ufl.edu (TV's Big Dealer) S...,3,3,soc.religion.christian
2248,"From: ""danny hawrysio"" <danny.hawrysio@canrem....",1,1,comp.graphics
2249,From: shellgate!llo@uu4.psi.com (Larry L. Over...,3,3,soc.religion.christian
2250,From: ingles@engin.umich.edu (Ray Ingles) Subj...,0,0,alt.atheism
2251,From: Mark-Tarbell@suite.com Subject: Amniocen...,2,2,sci.med
2252,From: roos@Operoni.Helsinki.FI (Christophe Roo...,2,2,sci.med
2253,From: mhollowa@ic.sunysb.edu (Michael Holloway...,2,2,sci.med
2254,From: sasghm@theseus.unx.sas.com (Gary Merrill...,2,2,sci.med
2255,From: Dan Wallach <dwallach@cs.berkeley.edu> S...,2,2,sci.med
2256,From: dyer@spdcc.com (Steve Dyer) Subject: Re:...,2,2,sci.med


In [48]:
# showing the last 10
X.tail(10)

Unnamed: 0,text,categoty,category,category_name
2247,From: daniels@math.ufl.edu (TV's Big Dealer) S...,3,3,soc.religion.christian
2248,"From: ""danny hawrysio"" <danny.hawrysio@canrem....",1,1,comp.graphics
2249,From: shellgate!llo@uu4.psi.com (Larry L. Over...,3,3,soc.religion.christian
2250,From: ingles@engin.umich.edu (Ray Ingles) Subj...,0,0,alt.atheism
2251,From: Mark-Tarbell@suite.com Subject: Amniocen...,2,2,sci.med
2252,From: roos@Operoni.Helsinki.FI (Christophe Roo...,2,2,sci.med
2253,From: mhollowa@ic.sunysb.edu (Michael Holloway...,2,2,sci.med
2254,From: sasghm@theseus.unx.sas.com (Gary Merrill...,2,2,sci.med
2255,From: Dan Wallach <dwallach@cs.berkeley.edu> S...,2,2,sci.med
2256,From: dyer@spdcc.com (Steve Dyer) Subject: Re:...,2,2,sci.med


In [50]:
# showing the first 10
X.head(10)

Unnamed: 0,text,categoty,category,category_name
0,From: sd345@city.ac.uk (Michael Collier) Subje...,1,1,comp.graphics
1,From: ani@ms.uky.edu (Aniruddha B. Deglurkar) ...,1,1,comp.graphics
2,From: djohnson@cs.ucsd.edu (Darin Johnson) Sub...,3,3,soc.religion.christian
3,From: s0612596@let.rug.nl (M.M. Zwart) Subject...,3,3,soc.religion.christian
4,From: stanly@grok11.columbiasc.ncr.com (stanly...,3,3,soc.religion.christian
5,From: vbv@lor.eeap.cwru.edu (Virgilio (Dean) B...,3,3,soc.religion.christian
6,From: jodfishe@silver.ucs.indiana.edu (joseph ...,3,3,soc.religion.christian
7,From: aldridge@netcom.com (Jacquelin Aldridge)...,2,2,sci.med
8,From: geb@cs.pitt.edu (Gordon Banks) Subject: ...,2,2,sci.med
9,From: libman@hsc.usc.edu (Marlena Libman) Subj...,2,2,sci.med


In [46]:
# using loc (by label)
X.loc[:10, 'text']

0     From: sd345@city.ac.uk (Michael Collier) Subje...
1     From: ani@ms.uky.edu (Aniruddha B. Deglurkar) ...
2     From: djohnson@cs.ucsd.edu (Darin Johnson) Sub...
3     From: s0612596@let.rug.nl (M.M. Zwart) Subject...
4     From: stanly@grok11.columbiasc.ncr.com (stanly...
5     From: vbv@lor.eeap.cwru.edu (Virgilio (Dean) B...
6     From: jodfishe@silver.ucs.indiana.edu (joseph ...
7     From: aldridge@netcom.com (Jacquelin Aldridge)...
8     From: geb@cs.pitt.edu (Gordon Banks) Subject: ...
9     From: libman@hsc.usc.edu (Marlena Libman) Subj...
10    From: anasaz!karl@anasazi.com (Karl Dussik) Su...
Name: text, dtype: object

In [52]:
# using iloc (by position)
X.iloc[:10, 0]

0    From: sd345@city.ac.uk (Michael Collier) Subje...
1    From: ani@ms.uky.edu (Aniruddha B. Deglurkar) ...
2    From: djohnson@cs.ucsd.edu (Darin Johnson) Sub...
3    From: s0612596@let.rug.nl (M.M. Zwart) Subject...
4    From: stanly@grok11.columbiasc.ncr.com (stanly...
5    From: vbv@lor.eeap.cwru.edu (Virgilio (Dean) B...
6    From: jodfishe@silver.ucs.indiana.edu (joseph ...
7    From: aldridge@netcom.com (Jacquelin Aldridge)...
8    From: geb@cs.pitt.edu (Gordon Banks) Subject: ...
9    From: libman@hsc.usc.edu (Marlena Libman) Subj...
Name: text, dtype: object

In [None]:
# exercise 2 (take home)
# Experiment with other querying techniques using pandas dataframes. Refer to their documentation for more information
# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html

In [64]:
# exercise 3
# try to fetch records belonging to the sci.med category, and query every 10th record. Only show the first 5 records.
print(X[X['category_name']=='sci.med'].iloc[::10][0:5])

                                                  text  categoty  category  \
7    From: aldridge@netcom.com (Jacquelin Aldridge)...         2         2   
49   From: jimj@contractor.EBay.Sun.COM (Jim Jones)...         2         2   
82   From: jason@ab20.larc.nasa.gov (Jason Austin) ...         2         2   
118  From: rogers@calamari.hi.com (Andrew Rogers) S...         2         2   
142  From: lady@uhunix.uhcc.Hawaii.Edu (Lee Lady) S...         2         2   

    category_name  
7         sci.med  
49        sci.med  
82        sci.med  
118       sci.med  
142       sci.med  


In [66]:
# data mining using pandas
# missing values

#check missing values
X.isnull()

Unnamed: 0,text,categoty,category,category_name
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
2252,False,False,False,False
2253,False,False,False,False
2254,False,False,False,False
2255,False,False,False,False


In [68]:
def check_missing_values(row):
    """Count how many entries of ``row`` compare equal to ``True``.

    Meant to be applied to a boolean mask such as the result of
    ``DataFrame.isnull()``, where each ``True`` marks a missing value.

    Args:
        row: iterable of flags (e.g. one column or one row of the mask).

    Returns:
        tuple: ``(label, count)`` where ``label`` is a fixed message string
        and ``count`` is the number of flagged entries.
    """
    # the explicit `== True` comparison is kept so that only values equal to
    # True are counted, exactly as a plain equality test would decide
    missing_total = sum(1 for flag in row if flag == True)
    return ("The amoung of missing records is: ", missing_total)


In [72]:
X.isnull().apply(lambda x: check_missing_values(x))

Unnamed: 0,text,categoty,category,category_name
0,The amoung of missing records is:,The amoung of missing records is:,The amoung of missing records is:,The amoung of missing records is:
1,0,0,0,0


In [74]:
# exercise 4
X.isnull().apply(lambda x:check_missing_values(x), axis=1)

0       (The amoung of missing records is: , 0)
1       (The amoung of missing records is: , 0)
2       (The amoung of missing records is: , 0)
3       (The amoung of missing records is: , 0)
4       (The amoung of missing records is: , 0)
                         ...                   
2252    (The amoung of missing records is: , 0)
2253    (The amoung of missing records is: , 0)
2254    (The amoung of missing records is: , 0)
2255    (The amoung of missing records is: , 0)
2256    (The amoung of missing records is: , 0)
Length: 2257, dtype: object

In [76]:
dummy_series = pd.Series(["dummy_record", 1], index=["text", "category"])

In [78]:
dummy_series

text        dummy_record
category               1
dtype: object

In [80]:
dummy_series.to_frame().T

Unnamed: 0,text,category
0,dummy_record,1


In [82]:
result_with_series = pd.concat([X, dummy_series.to_frame().T], ignore_index=True)

In [84]:
# check if the record was committed into the result
len(result_with_series)

2258

In [86]:
result_with_series.isnull().apply(lambda x: check_missing_values(x))

Unnamed: 0,text,categoty,category,category_name
0,The amoung of missing records is:,The amoung of missing records is:,The amoung of missing records is:,The amoung of missing records is:
1,0,1,0,1


In [88]:
# dummy record as dictionary format
dummy_dict = [{'text': 'dummy_record',
              'category':1
              }]

In [90]:
X = pd.concat([X, pd.DataFrame(dummy_dict)], ignore_index=True)

In [92]:
len(X)

2258

In [94]:
X.isnull().apply(lambda x: check_missing_values(x))

Unnamed: 0,text,categoty,category,category_name
0,The amoung of missing records is:,The amoung of missing records is:,The amoung of missing records is:,The amoung of missing records is:
1,0,1,0,1


In [96]:
# drop the missing values
X.dropna(inplace=True)

In [98]:
X.isnull().apply(lambda x: check_missing_values(x))

Unnamed: 0,text,categoty,category,category_name
0,The amoung of missing records is:,The amoung of missing records is:,The amoung of missing records is:,The amoung of missing records is:
1,0,0,0,0


In [100]:
len(X)

2257

# exercise 5 (take home)
There is an old saying that goes, "The devil is in the details." When we are working with extremely large data, it's difficult to check records one by one (as we have been doing so far). And also, we don't even know what kind of missing values we are
facing. Thus, "debugging" skills get sharper as we spend more time solving bugs. Let's focus on a different method to check for missing values and the kinds of missing values you may encounter. It's not easy to check for missing values, as you will find
out in a minute.
Please check the data and the process below, then describe what you observe and why it happened.
Hint: why didn't `.isnull()` work?

In [102]:
import numpy as np

# Six records, each showing a different way a value can *look* missing.
# The trailing notes state what isnull() reports for that record.
NA_dict = [{ 'id': 'A', 'missing_example': np.nan},   # explicit NaN        -> null
           { 'id': 'B'},                              # key left out        -> null
           { 'id': 'C', 'missing_example': 'NaN'},    # the string 'NaN'    -> not null
           { 'id': 'D', 'missing_example': 'None'},   # the string 'None'   -> not null
           { 'id': 'E', 'missing_example': None},     # Python None         -> null
           { 'id': 'F', 'missing_example': ''}]       # empty string        -> not null

# build a two-column frame from the records and display it
NA_df = pd.DataFrame(NA_dict, columns = ['id', 'missing_example'])
NA_df

Unnamed: 0,id,missing_example
0,A,
1,B,
2,C,
3,D,
4,E,
5,F,


In [104]:
NA_df['missing_example'].isnull()

0     True
1     True
2    False
3    False
4     True
5    False
Name: missing_example, dtype: bool

# Answer here
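Because `'NaN'`, `'None'`, and `''` are ordinary string values, the system treats them as meaningful data rather than as missing values.
Therefore, `isnull()` returns False for rows C, D, and F, since those values are defined; it only returns True for `np.nan`, `None`, and the absent key (rows A, E, and B).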

In [None]:
# Dealing with duplicate data

