## Set up Environment 

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

# Feature selection classes
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# metrics
from sklearn import metrics



## Load data and generate TDM

In [2]:
# Restrict the corpus to the two sports newsgroups.
categories = ['rec.sport.baseball', 'rec.sport.hockey']

# Fetch the training split with headers, footers and quoted text removed;
# the fixed random_state makes the shuffle repeatable.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)

# Raw documents and their class labels.
X = newsgroups_train.data
Y = newsgroups_train.target

# Fit a default TF-IDF vectorizer on the training documents and build the TDM.
vectorizer = TfidfVectorizer()
X_vec = vectorizer.fit_transform(X)


### Examine the features generated

In [3]:
# Vocabulary learned by the vectorizer, kept as a plain list of strings.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps ftr_names a list and the printout below in list form.
ftr_names = vectorizer.get_feature_names_out().tolist()
print("Number of features: %d" % len(ftr_names))
print("First 100: %s" % ftr_names[:100])


Number of features: 14023
First 100: ['00', '000', '000th', '002', '005', '007', '01', '011', '013', '013939', '014', '015', '016', '019', '02', '020', '021', '023', '024', '025', '027', '029', '03', '036', '037', '038', '039', '04', '040', '041', '042', '043', '0435', '044', '0458', '0483', '05', '050', '051', '052', '053', '055', '056', '059', '06', '065', '0666', '067', '069', '07', '070', '071', '072', '075', '077', '079', '08', '083', '086', '087', '088', '089', '08903', '09', '091', '094', '095', '097', '099', '10', '100', '1000', '1001', '1003', '1004', '1005', '1006', '1007', '1008', '101', '1010', '1012', '1013', '1014', '1015', '1016', '1017', '1018', '1019', '102', '1020', '1021', '1022', '1023', '1024', '1025', '1026', '1027', '1028', '10280']


### Remove stopwords and add document-frequency reduction

In [4]:
# Rebuild the vectorizer so that it drops English stopwords, terms that occur
# in more than 95% of the documents (max_df) and terms that occur in fewer
# than 3 documents (min_df).
vectorizer = TfidfVectorizer(max_df=0.95, min_df=3, stop_words="english")
X_vec = vectorizer.fit_transform(X)   # transform training data

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps ftr_names a list and the printout below in list form.
ftr_names = vectorizer.get_feature_names_out().tolist()
print("Number of features: %d" % len(ftr_names))
print("First 100: %s" % ftr_names[:100])



Number of features: 4779
First 100: ['00', '000', '01', '02', '03', '038', '04', '05', '06', '0666', '07', '08', '09', '091', '10', '100', '1000', '101', '1019', '102', '1020', '1029', '103', '1036', '1038', '104', '1046', '105', '106', '1061', '107', '1073', '108', '109', '10th', '11', '110', '111', '112', '113', '114', '1145', '115', '1157', '116', '117', '118', '1186', '119', '11th', '12', '120', '121', '122', '1223', '123', '124', '125', '126', '1262', '127', '128', '129', '13', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '14', '140', '1400', '141', '142', '143', '1430', '144', '145', '146', '147', '148', '149', '15', '150', '151', '152', '153', '154', '155', '156', '157', '158', '159', '15th', '16']


## Use the Chi-Squared test for feature selection

In [5]:
# Score every feature against the class labels with chi-squared, keep the top 100.
chi = SelectKBest(score_func=chi2, k=100)

# Fit the selector on the training TDM, then project the TDM onto the
# reduced feature space.
chi.fit(X_vec, Y)
X_chi_vec = chi.transform(X_vec)

# Indices of the selected features within the original vocabulary/feature space.
mask = chi.get_support(indices=True)

mask  # display the selected indices

array([ 575,  579,  663,  730,  808,  814,  835,  836,  847,  852,  992,
       1023, 1073, 1088, 1089, 1103, 1206, 1240, 1257, 1423, 1426, 1551,
       1555, 1607, 1612, 1747, 1758, 1767, 1900, 1918, 1931, 1967, 2035,
       2044, 2049, 2055, 2056, 2060, 2161, 2174, 2230, 2233, 2240, 2289,
       2316, 2416, 2426, 2452, 2499, 2538, 2625, 2651, 2682, 2725, 2799,
       2827, 2899, 2978, 2985, 3054, 3072, 3087, 3190, 3278, 3280, 3292,
       3310, 3337, 3338, 3339, 3340, 3342, 3344, 3364, 3365, 3416, 3497,
       3502, 3525, 3552, 3566, 3604, 3737, 3744, 3760, 3765, 3781, 3785,
       3941, 4078, 4109, 4110, 4124, 4419, 4469, 4543, 4672, 4673, 4676,
       4734])

In [6]:
# Show the vocabulary term behind each of the first ten selected indices.
for ftr_idx in mask[:10]:
    print("index: %d, feature name: %s" % (ftr_idx, ftr_names[ftr_idx]))



index: 575, feature name: aaa
index: 579, feature name: abc
index: 663, feature name: alomar
index: 730, feature name: arena
index: 808, feature name: baerga
index: 814, feature name: ball
index: 835, feature name: base
index: 836, feature name: baseball
index: 847, feature name: bat
index: 852, feature name: batting


In [7]:
# Map every selected index back to its vocabulary term to obtain the k best features.
new_ftrs = [ftr_names[ftr_idx] for ftr_idx in mask]

print("Number of features: %d" % len(new_ftrs))
print("First 100: %s" % new_ftrs[:100])
        

Number of features: 100
First 100: ['aaa', 'abc', 'alomar', 'arena', 'baerga', 'ball', 'base', 'baseball', 'bat', 'batting', 'braves', 'bruins', 'calgary', 'canada', 'canadian', 'captain', 'chop', 'clemens', 'coach', 'cubs', 'cup', 'detroit', 'devils', 'dl', 'dodgers', 'era', 'espn', 'european', 'finals', 'flames', 'flyers', 'francis', 'giants', 'gilmour', 'gld', 'gm', 'goal', 'goals', 'hartford', 'hawks', 'hit', 'hitter', 'hockey', 'hr', 'ice', 'islanders', 'jagr', 'jewish', 'keenan', 'kings', 'leafs', 'lemieux', 'lindros', 'lopez', 'manager', 'mask', 'mets', 'montreal', 'morris', 'nhl', 'nl', 'north', 'ottawa', 'penguins', 'pens', 'period', 'phillies', 'pitch', 'pitched', 'pitcher', 'pitchers', 'pitching', 'pittsburgh', 'playoff', 'playoffs', 'pp', 'pts', 'puck', 'quebec', 'rangers', 'rbi', 'reds', 'rockies', 'roger', 'rotation', 'round', 'run', 'runs', 'sharks', 'sox', 'stadium', 'staff', 'stanley', 'traded', 'tv', 'vancouver', 'wings', 'winner', 'winnipeg', 'yankees']


## Perform classification using chi-squared feature selection

In [8]:
categories = ['rec.sport.baseball', 'rec.sport.hockey']

# Options shared by the train and test fetches, so both splits are filtered
# and shuffled the same way.
fetch_options = dict(
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)

# --- Training data -> TF-IDF matrix ---
newsgroups_train = fetch_20newsgroups(subset='train', **fetch_options)
X = newsgroups_train.data
Y = newsgroups_train.target

vectorizer = TfidfVectorizer()
X_vec = vectorizer.fit_transform(X)

# --- Chi-squared selection of the top 100 features, fitted on training data ---
fs = SelectKBest(score_func=chi2, k=100)
X_fs_vec = fs.fit(X_vec, Y).transform(X_vec)

# --- Test data goes through the already-fitted vectorizer and selector ---
newsgroups_test = fetch_20newsgroups(subset='test', **fetch_options)
vectors_test = vectorizer.transform(newsgroups_test.data)
fs_test = fs.transform(vectors_test)

# --- Train Naive Bayes on the reduced features and predict the test set ---
classifier = MultinomialNB(alpha=.01)
classifier.fit(X_fs_vec, Y)
predicted = classifier.predict(fs_test)

print(
    metrics.classification_report(
        newsgroups_test.target,
        predicted,
        target_names=newsgroups_train.target_names,
    )
)

                    precision    recall  f1-score   support

rec.sport.baseball       0.97      0.66      0.79       397
  rec.sport.hockey       0.75      0.98      0.85       399

       avg / total       0.86      0.82      0.82       796



##  Look at some of the performance variables available

In [9]:
# Each entry pairs an output format with the score it reports; all scores
# compare the test-set labels with the classifier's predictions.
score_lines = (
    ("Accuracy = %6.4f",
     metrics.accuracy_score(newsgroups_test.target, predicted)),
    # micro-averaged recall is the same as accuracy
    ("Avg recall, micro = %6.4f",
     metrics.recall_score(newsgroups_test.target, predicted, average='micro')),
    # macro recall: average of the per-class recall values
    ("Avg recall, macro = %6.4f",
     metrics.recall_score(newsgroups_test.target, predicted, average='macro')),
    # macro precision: average of the per-class precision values
    ("Avg precision, macro = %6.4f",
     metrics.precision_score(newsgroups_test.target, predicted, average='macro')),
)

for line_format, score in score_lines:
    print(line_format % score)


Accuracy = 0.8241
Avg recall, micro = 0.8241
Avg recall, macro = 0.8237
Avg precision, macro = 0.8604


## Task 1: Investigate the impact of class imbalance on performance measures

Build a classifier that predicts the categories 'talk.religion.misc' and 'soc.religion.christian' from the 20-newsgroups dataset.  
Test it first on the training set.
Test it also on the test set.  

Add a markdown cell and include your observations on the following:
* the class distribution of the training set  (evident from the classification report of testing on the training set -  support = the number of instances of each class)
* the performance on the training set versus the performance on the test set, and what any difference between the two means
* which performance measure should be used.

Note that a Markdown cheatsheet for Jupyter notebooks is available at https://medium.com/ibm-data-science-experience/markdown-for-jupyter-notebooks-cheatsheet-386c05aeebed


## Task 2: Investigate the impact of stopword removal and DF reduction on performance

Build a classifier on at least 3 categories of the 20-newsgroups dataset.  Measure its performance with stopword removal and with various levels of document frequency reduction.

Add a markdown cell and outline your results showing the number of features used by the different settings and the impact on performance, if any.      

Justify your choice of performance measure.


## Task 3: Investigate using different feature selection techniques

Build a classifier on at least 3 categories of the 20-newsgroups dataset.  Train it both with and without chi-squared feature selection of the top 100 features.  Measure the performance of each on the test set.

In a markdown cell include your observations on the following:
* can a reduced feature set meet the performance of the classifier on the full feature set?
* how many features would you select to use on this dataset?
