# Package Import (Must Run)

In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, ENGLISH_STOP_WORDS
import string
import re
from nltk.stem import PorterStemmer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB
# FIXME: an incomplete `from` statement stood here (module and names missing),
# which made this whole cell a SyntaxError. Restore the intended import if a
# later cell needs it.

from sklearn.model_selection import KFold, GridSearchCV

# Constants and Helper Functions (Must Run)

In [13]:
#-----------------------------Constants-----------------------------------#
# Binary class labels produced by get_target_values().
COMPUTER_TECHNOLOGY = 0
RECREATIONAL_ACTIVITY = 1

# Position of the last 'comp.*' entry in the 'categories' list; every
# category index at or below this value counts as computer technology.
LAST_COMPUTER_TECHNOLOGY_INDEX = 3

# Number of components to keep when reducing dimensionality.
k = 50


#-----------------------------Helper Functions----------------------------#

def tokenizer_stem(text):
    """Split ``text`` into Porter-stemmed, whitespace-delimited tokens.

    Characters in ``string.punctuation`` and any non-ASCII characters are
    deleted before splitting; tokens made up solely of digits are discarded.

    Returns a list of stemmed token strings.
    """
    # Punctuation is deleted outright (not replaced by a space), so a word
    # such as "don't" collapses to "dont" instead of splitting in two.
    stripped = text.translate(str.maketrans('', '', string.punctuation))

    # Keep only code points inside the ASCII range.
    ascii_only = "".join(filter(lambda ch: ord(ch) < 128, stripped))

    porter = PorterStemmer()
    stems = []
    for word in ascii_only.split():
        if word.isdigit():
            continue  # purely numeric tokens are skipped
        stems.append(porter.stem(word))
    return stems



def get_target_values(target_array):
    """Collapse newsgroup category indices into binary class labels.

    Parameters
    ----------
    target_array : array-like of int, 1-D
        Category index of each document. NumPy arrays and plain Python
        sequences are both accepted.

    Returns
    -------
    numpy.ndarray
        Array of the same length holding COMPUTER_TECHNOLOGY where the index
        is <= LAST_COMPUTER_TECHNOLOGY_INDEX and RECREATIONAL_ACTIVITY
        otherwise. An empty input yields an empty integer array.
    """
    indices = np.asarray(target_array)
    # Vectorised threshold test replaces the per-element Python loop.
    return np.where(indices <= LAST_COMPUTER_TECHNOLOGY_INDEX,
                    COMPUTER_TECHNOLOGY,
                    RECREATIONAL_ACTIVITY)


# Plotting ROC curve 
def plot_roc_curve(target_values, predicted_prob, roc_title):
    """Plot an ROC curve on a newly created figure.

    target_values  -- true labels, forwarded to ``roc_curve``
    predicted_prob -- scores for those samples, forwarded to ``roc_curve``
    roc_title      -- text used as the axes title

    The legend shows the area under the curve to two decimals. Nothing is
    returned; the figure is left to the active matplotlib backend.
    """
    figure, axes = plt.subplots()

    # The threshold array returned by roc_curve is not needed for the plot.
    false_pos, true_pos, _ = roc_curve(target_values, predicted_prob)
    legend_text = 'area under curve = %0.2f' % auc(false_pos, true_pos)

    axes.plot(false_pos, true_pos, lw=2, label=legend_text)
    # Dashed red diagonal from (0, 0) to (1, 1) as a visual reference.
    axes.plot([0, 1], [0, 1], 'r--')
    axes.set(
        xlabel='False Positive Rate',
        ylabel='True Positive Rate',
        title=roc_title,
    )
    axes.legend(loc="lower right")



# Classification Preparation (Must Run)

In [14]:
# Extract datasets: four 'comp.*' groups followed by four 'rec.*' groups.
# The order matters: get_target_values() treats indices 0..3 as computer
# technology and the rest as recreational activity.
categories = ['comp.graphics',
              'comp.os.ms-windows.misc',
              'comp.sys.ibm.pc.hardware',
              'comp.sys.mac.hardware',
              'rec.autos',
              'rec.motorcycles',
              'rec.sport.baseball',
              'rec.sport.hockey']

eight_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
eight_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

# Set labels for binary classifiers
eight_train_target = get_target_values(eight_train.target)
eight_test_target = get_target_values(eight_test.target)

# Initialize term count vectorizer.
# stop_words='english' selects scikit-learn's built-in English list (the same
# words as ENGLISH_STOP_WORDS). Passing the frozenset itself is rejected by
# scikit-learn versions that validate constructor parameters, which accept
# only 'english', a list, or None.
count_vectorizer_min_df_3 = CountVectorizer(
    analyzer = 'word',
    stop_words = 'english',
    min_df = 3
)

# Learn the vocabulary from the training documents only; the test documents
# are mapped onto that same vocabulary.
train_count_min_df_3 = count_vectorizer_min_df_3.fit_transform(eight_train.data)
test_count_min_df_3 = count_vectorizer_min_df_3.transform(eight_test.data)


# TFIDF transformer for problem c-j
tfidf_transformer = TfidfTransformer()
# Compute tfidf matrices: IDF weights are fitted on the training counts and
# then applied unchanged to the test counts.
eight_train_tfidf_min_df_3 = tfidf_transformer.fit_transform(train_count_min_df_3)
eight_test_tfidf_min_df_3 = tfidf_transformer.transform(test_count_min_df_3)




# Problem 1

In [7]:
# Report the shape of the training TF-IDF matrix (rows are documents,
# columns are vocabulary terms).
print("Dimensions of TF-IDF matrix: " + "{}".format(eight_train_tfidf_min_df_3.shape))

Dimensions of TF-IDF matrix: (4732, 20297)


# Problem 2

In [15]:

# Cluster the training documents into two groups using the TF-IDF features.
# n_init is set explicitly: scikit-learn's default changed from 10 to 'auto'
# in release 1.4, so leaving it implicit makes the clustering depend on the
# installed version. 10 is the earlier default.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42).fit(eight_train_tfidf_min_df_3)

# Rows are the true classes (computer technology, recreational activity);
# columns are cluster ids. Cluster ids are arbitrary, so a good clustering may
# show up on either diagonal of the matrix.
confusionmatrix = confusion_matrix(eight_train_target, kmeans.labels_, labels = [COMPUTER_TECHNOLOGY, RECREATIONAL_ACTIVITY])
print(confusionmatrix)

[[2340    3]
 [1339 1050]]
