# Problem 5

In [1]:
# code source from http://scikit-learn.org/stable/datasets/twenty_newsgroups.html#
# code source from http://www.nltk.org/book/ch02.html
# code source from http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.euclidean_distances.html

from sklearn.datasets import fetch_20newsgroups

import nltk
from nltk.corpus import stopwords
import string

# extracts the archive contents in the ~/scikit_learn_data/20news_home folder 
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

################# this part is for pre-processing the text file which isn't mandatory
# get rid of the stopwords and punctuation
nltk.download('stopwords')
stop_word =stopwords.words('english')
punctuation = string.punctuation
stopw_punctuation = list(stop_word) + list(punctuation)

# from nltk.tokenize import word_tokenize

# # to tokenize the word
# for j in range(len(newsgroups_train.data)):
#     newsgroups_train.data[j] = " ".join([w for w in word_tokenize(newsgroups_train.data[j]) if w not in stopw_punctuation])
#    # print(newsgroups_train.data[])
# for j in range(len(newsgroups_test.data)):
#     newsgroups_test.data[j] = " ".join([w for w in word_tokenize(newsgroups_train.data[j]) if w not in stopw_punctuation])

#print(len(newsgroups_train.data))
#print(newsgroups_train.data[1])

##############

# from http://scikit-learn.org/stable/datasets/twenty_newsgroups.html
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf = True, max_df = 0.5, stop_words = 'english')
vectors_train = vectorizer.fit_transform(newsgroups_train.data)
# the test data set don't need to fit
vectorizer = TfidfVectorizer(sublinear_tf = True, max_df = 0.5, stop_words = 'english', vocabulary = vectorizer.vocabulary_)
vectors_test = vectorizer.fit_transform(newsgroups_test.data)
print(vectors_train.shape)
print(vectors_test.shape)


news_train = vectors_train
news_test = vectors_test



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/deshenghu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
(11314, 129791)
(7532, 129791)


In [30]:
## example from link: http://scikit-learn.org/stable/modules/feature_selection.html
'''
>>> from sklearn.svm import LinearSVC
>>> from sklearn.datasets import load_iris
>>> from sklearn.feature_selection import SelectFromModel
>>> iris = load_iris()
>>> X, y = iris.data, iris.target
>>> X.shape
(150, 4)
>>> lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
>>> model = SelectFromModel(lsvc, prefit=True)
>>> X_new = model.transform(X)
>>> X_new.shape
(150, 3)
'''
from sklearn.linear_model import LogisticRegression as LR
from sklearn.feature_selection import SelectFromModel as SFM
# link: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# Linear Model trained with L1 prior as regularizer (aka the Lasso)
## 20 NG news dataset
news_train_label = newsgroups_train.target
Lasso = LR(C = 0.3, penalty='l1',).fit(news_train, news_train_label)
model = SFM(Lasso, prefit = True)
X_train = model.transform(news_train)
X_test = model.transform(news_test)

news_test_label = newsgroups_test.target
LR_lasso = LR().fit(X_train, news_train_label)

accuracy_news_train_lasso = LR_lasso.score(X_train, news_train_label)
accuracy_news_test_lasso = LR_lasso.score(X_test, news_test_label)

print("20NG News Training dataset, Accuracy of LR over L1 prior as regularizer(i.e., Lasso) is {}".format(accuracy_news_train_lasso))
print("20NG News Test dataset, Accuracy of LR over L1 prior as regularizer(i.e., Lasso) is {}".format(accuracy_news_test_lasso))



20NG News Training dataset, Accuracy of LR over L1 prior as regularizer(i.e., Lasso) is 0.750132579105533
20NG News Test dataset, Accuracy of LR over L1 prior as regularizer(i.e., Lasso) is 0.676048858204992


# Problem 6 

In [77]:
import numpy as np
# source: http://scikit-learn.org/stable/datasets/index.html
from sklearn.datasets import fetch_mldata

mnist_data = fetch_mldata('MNIST original')['data']

#get the labels of the MNIST dataset 
mnist_label = fetch_mldata('MNIST original')['target']
print(mnist_data.shape)

## link:https://docs.python.org/3/library/random.html#random.randint
from random import randrange

'''
here is to generate a list of rectangles and each side of each rectangle is least 5 and at most 23
'''
rectangle_list = []
for _ in range(130):
    ## rectangle is constrained to have approx 130-170 area, thus each side should be at least 5 and at most 23.
    A = randrange(0,23)
    B = randrange(0,23)
    C = randrange(a+5,28)
    D = randrange(b+5,28)
    rectangle_list.append([A,B,C,D])

'''
here function "get_harr_features" is to get a list of HARR_feature correponding to the input data set
'''
def get_harr_features(a, rectangle_list):
    
    black = np.zeros((28, 28, 28, 28))
    ## here is to calculate the sum of a area covering the two diagonal points
    for i in range(28):
        black[0, 0, 0, i] = sum(a[0, :i])
        black[0, 0, i, 0] = sum(a[:i, 0])
        
    ## here is to calculate the O-cornered rectangles by using dynamic programming, e.g., 
    '''
    for i=rows
    for j=columns
    black(rectangle-diag(ODij)) = black(rectangle-diag(ODi,j-1)) + black(rectangle-diag(ODi-1,j)) 
                                - black(rectangle-diag(ODi-1,j-1)) + black(pixel Dij)
    '''
    for i in range(1,28):
        for j in range(1, 28):
            black[0, 0, i, j] = black[0, 0, i-1, j] + black[0,0,i,j-1] - black[0,0,i-1,j-1] + a[i,j]
            
    HARR_feature_list = []
    '''
    Since all rectangles cornered at O have their black computed and stored, the procedure for general rectangles is to:
    black(rectangle ABCD) = black(OTYD) - black(OTXB) - black(OZYC) + black(OZXA)
    '''
    for a in rectangle_list:
        ## here is to calculate the feature value of the four rectangle areas corresponding to the vertical/horizonal views
        black[a[0], a[1], (a[0]+a[2])//2, a[3]]=black[0,0,(a[0]+a[2])//2,a[3]]-black[0,0,(a[0]+a[2])//2,a[1]]-black[0,0,a[3],a[0]]+ black[0,0,a[0],a[1]]
        black[(a[0]+a[2])//2, a[1], a[2], a[3]]=black[0,0,a[2],a[3]]-black[0,0,a[2],a[1]]-black[0,0,a[3],(a[0]+a[2])//2]+black[0,0,(a[0]+a[2])//2, a[1]]
        black[a[0],a[1],a[2],(a[1]+a[3])//2]=black[0,0,a[2],(a[1]+a[3])//2]-black[0,0,a[2],a[1]]-black[0,0,(a[1]+a[3])//2,a[0]]+black[0,0,a[0],a[1]]
        black[a[0],(a[1]+a[3])//2,a[2],a[3]]=black[0,0,a[2],a[3]]-black[0,0,a[2],(a[1]+a[3])//2]-black[0,0,a[3],a[0]]+black[0,0,a[0],(a[1]+a[3])//2]
        ## the black vertical difference: black(top-half) - black(bottom-half)
        vertic = black[a[0],a[1],(a[0]+a[2])//2, a[3]] - black[(a[0]+a[2])//2,a[1],a[2],a[3]]
        ## the black horizontal difference: black(left-half) - black(right-half)
        horizon= black[a[0],a[1],a[2], (a[1]+a[3])//2] - black[a[0],(a[1]+a[3])//2,a[2],a[3]]
        ## append the black vertical difference into the HARR_feature_list
        HARR_feature_list.append(vertic)
        ## append the black horizontal difference into the HARR_feature_list
        HARR_feature_list.append(horizon)
        
    return HARR_feature_list

'''
here is to append the HARR features for the whole mnist dataset into a list named "mnist_harr_feature_list"
'''
mnist_harr_feature_list = []
for i in range(0,mnist_data.shape[0],1):
    mnist_harr_feature_list.append(get_harr_features(mnist_data[i,:].reshape((28,28)),rectangle_list))
    
'''
here is to append the labels for the whole mnist dataset into a list 
'''
mnist_harr_label_list = []
for i in range(0,mnist_data.shape[0],1):
    mnist_harr_label_list.append(mnist_label[i])


(70000, 784)


In [78]:
# For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’
# link: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
LR_harr = LR(penalty='l2', solver = 'sag')
LR_harr.fit(mnist_harr_feature_list, mnist_harr_label_list)
accuracy_mnist_harr= LR_harr.score(mnist_harr_feature_list, mnist_harr_label_list)

print("MNIST dataset, Accuracy of LR over HARR feature Extraction is {}".format(accuracy_mnist_harr))




MNIST dataset, Accuracy of LR over HARR feature Extraction is 0.9117857142857143
