In [3]:
import multiprocessing
import time
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import math
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.util import ngrams

# The newsgroups this cell restricts itself to (see the banner printed below).
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

print("===== Due to Memory Constraints, Only Four Categories of News Group are used.")
print("===== categories : "+str(categories)+" ======")

# Headers and footers are removed; the 'quotes' filter is left disabled.
print("===== Loading Partial Training Dataset =====")
twenty_train_partial = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers'),
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Same settings as the training split, applied to the test split.
print("===== Loading Partial Testing Dataset =====")
twenty_test_partial = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers'),
    categories=categories,
    shuffle=True,
    random_state=42,
)

print("===== PreProcessing =====")

def remove_string_special_characters(s):
    """Return ``s`` lowercased with special characters stripped out.

    Processing steps, in order:
      1. runs of underscores are deleted;
      2. digits are deleted;
      3. forward slashes become spaces;
      4. every character that is neither a word character nor whitespace
         is deleted;
      5. each run of whitespace is collapsed to a single space;
      6. the result is lowercased.

    Args:
        s: The raw text to clean.

    Returns:
        The cleaned string. Leading/trailing whitespace is collapsed to a
        single space but not stripped.
    """
    # Raw string literals keep the regex escapes (\d, \w, \s) from being
    # parsed as (invalid) string escape sequences.
    stripped = re.sub(r'_+', '', s)
    stripped = re.sub(r'\d', '', stripped)
    stripped = re.sub(r'/', ' ', stripped)
    stripped = re.sub(r'[^\w\s]', '', stripped)
    # Collapse whitespace last: the removals above can leave adjacent
    # spaces behind (e.g. "a 1 b" -> "a  b"), which must be merged too.
    stripped = re.sub(r'\s+', ' ', stripped)
    return stripped.lower()

# Cleaned documents produced by main(); read by the feature-extraction code
# further down.
text_sents_clean = []


def main():
    """Clean the first tenth of the training documents.

    Each of the first ``len(twenty_train_partial.data) // 10`` documents is
    passed through ``remove_string_special_characters`` and appended to the
    module-level ``text_sents_clean`` list.

    The multiprocessing queue that was filled but never read, and the sleep
    that followed it, were removed: neither affected the cleaned output.
    """
    sample_size = len(twenty_train_partial.data) // 10
    text_sents_clean.extend(
        remove_string_special_characters(document)
        for document in twenty_train_partial.data[:sample_size]
    )


if __name__ == "__main__":
    main()

print(" ========================= ")
print(" twenty_train_partial.data size : "+str(len(twenty_train_partial.data)))
print(" text_sents_clean size : "+str(len(text_sents_clean)))
print("===== Features =====")

print("===== Bag of Words =====")
# Fit a word-level count vectorizer (English stop words removed) on the
# cleaned documents; its vocabulary is the bag of words.
count_vect = CountVectorizer(analyzer='word', stop_words='english')
X_train_counts = count_vect.fit_transform(text_sents_clean)
list_bow = list(count_vect.vocabulary_)

# One vocabulary term per line.
with open('bag_of_words.txt', 'w') as bow_file:
    bow_file.writelines('%s\n' % term for term in list_bow)
print("Written to bag_of_words.txt")

print("===== N-grams =====")

# Word trigrams of every cleaned document, gathered into one flat list.
output = []
for document in text_sents_clean:
    # Splitting on a single space can yield empty strings; drop them.
    tokens = [piece for piece in document.split(" ") if piece]
    output.extend(ngrams(tokens, 3))

# One trigram tuple per line, in its str() form.
with open('n-grams.txt', 'w') as ngram_file:
    ngram_file.writelines('%s\n' % str(gram) for gram in output)
print("Written to n-grams.txt")

print("===== TFIDF =====")

def get_doc(sent):
    """Build one length record per document.

    Args:
        sent: Iterable of document strings. (The singular name is kept so
            existing callers are unaffected.)

    Returns:
        A list of dicts ``{'doc_id': i, 'doc_length': n}`` where ``i`` is the
        1-based position of the document in ``sent`` and ``n`` is the value
        returned by ``count_words`` for it.
    """
    # Iterate over the argument itself; the previous loop ignored it and
    # always walked the module-level text_sents_clean list instead.
    return [
        {'doc_id': doc_id, 'doc_length': count_words(document)}
        for doc_id, document in enumerate(sent, start=1)
    ]

def count_words(sent):
    """Return how many tokens ``word_tokenize`` produces for ``sent``."""
    return sum(1 for _ in word_tokenize(sent))

def create_freq_dicts(sents):
    """Count token occurrences for each document.

    Args:
        sents: Iterable of document strings.

    Returns:
        A list of dicts ``{'doc_id': i, 'freq_dict': counts}`` where ``i`` is
        the 1-based position of the document in ``sents`` and ``counts`` maps
        each token from ``word_tokenize`` to its number of occurrences in
        that document.
    """
    freqDict_list = []
    for doc_id, sent in enumerate(sents, start=1):
        freq_dict = {}
        for word in word_tokenize(sent):
            # Increment the running count. The earlier ``= +1`` assigned the
            # constant 1, so repeated words were never counted past one.
            freq_dict[word] = freq_dict.get(word, 0) + 1
        freqDict_list.append({'doc_id': doc_id, 'freq_dict': freq_dict})
    return freqDict_list

def computeTF(doc_info, freqDict_list):
    """Compute a term-frequency score for every (document, term) pair.

    Args:
        doc_info: List of ``{'doc_id', 'doc_length'}`` records, indexed so
            that the record for document ``d`` sits at position ``d - 1``.
        freqDict_list: List of ``{'doc_id', 'freq_dict'}`` records.

    Returns:
        A list of ``{'doc_id', 'TF_score', 'key'}`` dicts, where ``TF_score``
        is the term's count divided by that document's ``doc_length``.
    """
    TF_scores = []
    for entry in freqDict_list:
        doc_id = entry['doc_id']
        term_counts = entry['freq_dict']
        TF_scores.extend(
            {
                'doc_id': doc_id,
                'TF_score': term_counts[term] / doc_info[doc_id - 1]['doc_length'],
                'key': term,
            }
            for term in term_counts
        )
    return TF_scores

def computeIDF(doc_info, freqDict_list):
    """Compute an inverse-document-frequency score per (document, term) pair.

    Args:
        doc_info: List of per-document records; only its length (the number
            of documents) is used.
        freqDict_list: List of ``{'doc_id', 'freq_dict'}`` records.

    Returns:
        A list of ``{'doc_id', 'IDF_score', 'key'}`` dicts, one per term of
        each document, where ``IDF_score`` is
        ``log(len(doc_info) / number_of_documents_containing_term)`` and
        ``doc_id`` is the 1-based position of the document in
        ``freqDict_list``.
    """
    # Count, once, how many documents contain each term. Previously this was
    # recomputed with a full corpus scan for every single (doc, term) pair.
    doc_freq = {}
    for entry in freqDict_list:
        for term in entry['freq_dict']:
            doc_freq[term] = doc_freq.get(term, 0) + 1

    total_docs = len(doc_info)
    IDF_scores = []
    for position, entry in enumerate(freqDict_list, start=1):
        for term in entry['freq_dict']:
            IDF_scores.append({
                'doc_id': position,
                'IDF_score': math.log(total_docs / doc_freq[term]),
                'key': term,
            })
    return IDF_scores

def computeTFIDF(TF_scores, IDF_scores):
    """Join TF and IDF records into TF-IDF scores.

    Args:
        TF_scores: List of ``{'doc_id', 'TF_score', 'key'}`` dicts.
        IDF_scores: List of ``{'doc_id', 'IDF_score', 'key'}`` dicts.

    Returns:
        A list of ``{'doc_id', 'TFIDF_score', 'key'}`` dicts, one for every
        TF/IDF pair that shares both ``key`` and ``doc_id``. Results follow
        the order of ``IDF_scores`` (and, within one IDF record, the order of
        the matching TF records).
    """
    # Index TF records by (term, document) so each IDF record finds its
    # partners with one lookup instead of a scan over every TF record.
    tf_index = {}
    for tf_entry in TF_scores:
        tf_index.setdefault((tf_entry['key'], tf_entry['doc_id']), []).append(tf_entry)

    TFIDF_scores = []
    for idf_entry in IDF_scores:
        matches = tf_index.get((idf_entry['key'], idf_entry['doc_id']), ())
        for tf_entry in matches:
            TFIDF_scores.append({
                'doc_id': tf_entry['doc_id'],
                'TFIDF_score': idf_entry['IDF_score'] * tf_entry['TF_score'],
                'key': tf_entry['key'],
            })
    return TFIDF_scores

# Per-document lengths and per-document term counts for the cleaned corpus.
doc_info = get_doc(text_sents_clean)
freqDict_list = create_freq_dicts(text_sents_clean)

# TF and IDF are computed separately, then joined into TF-IDF records.
TF_Score = computeTF(doc_info, freqDict_list)
IDF_Score = computeIDF(doc_info, freqDict_list)
TFIDF_Score = computeTFIDF(TF_Score, IDF_Score)

# One TF-IDF record per line, in its str() form.
with open('tfidf.txt', 'w') as tfidf_file:
    tfidf_file.writelines('%s\n' % str(record) for record in TFIDF_Score)
print("Written to tfidf.txt")

print("===== POS-tags =====")

# Tokenize each cleaned document and tag its tokens; one tagged list per
# document.
pos_output = [
    nltk.pos_tag(word_tokenize(document))
    for document in text_sents_clean
]

# One document's tagged tokens per line, in str() form.
with open('POS-tags.txt', 'w') as pos_file:
    pos_file.writelines('%s\n' % str(tagged) for tagged in pos_output)
print("Written to POS-tags.txt")

print("===== Lemmatization (Head Words) =====")

lemma_output = []
wnl = nltk.WordNetLemmatizer()

# Split every document into sentences, tokenize each sentence, and collect
# the lemma of each lowercased token in one flat list.
for document in text_sents_clean:
    for sentence in sent_tokenize(document):
        lemma_output.extend(
            wnl.lemmatize(token.lower()) for token in word_tokenize(sentence)
        )

# One lemma per line.
with open('HeadWords.txt', 'w') as lemma_file:
    lemma_file.writelines('%s\n' % str(lemma) for lemma in lemma_output)
print("Written to HeadWords.txt")


print(" ========================= ")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


===== Due to Memory Constraints, Only Four Categories of News Group are used.
===== Loading Partial Training Dataset =====
===== Loading Partial Testing Dataset =====
===== PreProcessing =====
 twenty_train_partial.data size : 2257
 text_sents_clean size : 225
===== Features =====
===== Bag of Words =====
Written to bag_of_words.txt
===== N-grams =====
Written to n-grams.txt
===== TFIDF =====
Written to tfidf.txt
===== POS-tags =====
Written to POS-tags.txt
===== Lemmatization (Head Words) =====
Written to HeadWords.txt


In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn import metrics

# No `categories` filter is passed to the loaders in this cell; headers,
# footers and quotes are all removed.
print("===== Loading Training Dataset =====")
twenty_train_partial = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)

print("===== Loading Testing Dataset =====")
twenty_test_partial = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)

print(" ========================= ")
print("===== Classifiers =====")

print("===== Build a MultinomialNB Classifier with Tfidf as Feature =====")
# Raw text -> token counts -> tf-idf weights -> multinomial naive Bayes.
text_clf_MultinomialNB = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

print("===== Train the MultinomialNB Classifier with training data =====")
text_clf_MultinomialNB.fit(twenty_train_partial.data, twenty_train_partial.target)

docs_test_MultinomialNB = twenty_test_partial.data
predicted_MultinomialNB = text_clf_MultinomialNB.predict(docs_test_MultinomialNB)

# Accuracy is the fraction of test documents whose prediction equals the label.
print("===== Accuracy of MultinomialNB Classifier =====")
print(np.mean(predicted_MultinomialNB == twenty_test_partial.target))

print("===== Classification Report of MultinomialNB Classifier =====")
print(metrics.classification_report(
    twenty_test_partial.target,
    predicted_MultinomialNB,
    target_names=twenty_test_partial.target_names,
))

# A confusion matrix is available via
# metrics.confusion_matrix(twenty_test_partial.target, predicted_MultinomialNB).

print(" ========================= ")

print("===== Build a SGDClassifier with Tfidf as Feature =====")
# Raw text -> token counts -> tf-idf weights -> SGD classifier with hinge
# loss and L2 penalty, run for a fixed 5 iterations (tol=None).
text_clf_SGDClassifier = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )),
])

print("===== Train the SGDClassifier with training data =====")
text_clf_SGDClassifier.fit(twenty_train_partial.data, twenty_train_partial.target)

docs_test_SGDClassifier = twenty_test_partial.data
predicted_SGDClassifier = text_clf_SGDClassifier.predict(docs_test_SGDClassifier)

# Accuracy is the fraction of test documents whose prediction equals the label.
print("===== Accuracy of SGDClassifier =====")
print(np.mean(predicted_SGDClassifier == twenty_test_partial.target))

print("===== Classification Report of SGDClassifier =====")
print(metrics.classification_report(
    twenty_test_partial.target,
    predicted_SGDClassifier,
    target_names=twenty_test_partial.target_names,
))

# A confusion matrix for the SGDClassifier is available via
# metrics.confusion_matrix(twenty_test_partial.target, predicted_SGDClassifier).

print(" ========================= ")

print("===== Build a svm.SVC Classifier with Tfidf as Feature =====")
# Raw text -> token counts -> tf-idf weights -> SVC with a linear kernel.
text_clf_svm_SVCClassifier = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', svm.SVC(kernel='linear')),
])

print("===== Train the svm.SVC Classifier with training data =====")
text_clf_svm_SVCClassifier.fit(twenty_train_partial.data, twenty_train_partial.target)

docs_test_svm_SVCClassifier = twenty_test_partial.data
predicted_svm_SVCClassifier = text_clf_svm_SVCClassifier.predict(docs_test_svm_SVCClassifier)

# Accuracy is the fraction of test documents whose prediction equals the label.
print("===== Accuracy of svm.SVC Classifier =====")
print(np.mean(predicted_svm_SVCClassifier == twenty_test_partial.target))

print("===== Classification Report of svm.SVC Classifier =====")
print(metrics.classification_report(
    twenty_test_partial.target,
    predicted_svm_SVCClassifier,
    target_names=twenty_test_partial.target_names,
))

# A confusion matrix is available via
# metrics.confusion_matrix(twenty_test_partial.target, predicted_svm_SVCClassifier).

print(" ========================= ")

===== Due to Memory Constraints, Only Four Categories of News Group are used.
===== Loading Partial Training Dataset =====
===== Loading Partial Testing Dataset =====
===== Classifiers =====
===== Build a MultinomialNB Classifier with Tfidf as Feature =====
===== Train the MultinomialNB Classifier with training data =====
===== Accuracy of MultinomialNB Classifier =====
0.6062134891131173
===== Classification Report of MultinomialNB Classifier =====
                          precision    recall  f1-score   support

             alt.atheism       0.81      0.07      0.13       319
           comp.graphics       0.72      0.62      0.67       389
 comp.os.ms-windows.misc       0.70      0.50      0.59       394
comp.sys.ibm.pc.hardware       0.55      0.75      0.64       392
   comp.sys.mac.hardware       0.81      0.61      0.69       385
          comp.windows.x       0.83      0.74      0.78       395
            misc.forsale       0.86      0.69      0.77       390
               re



===== Accuracy of SGDClassifier =====
0.6836165693043016
===== Classification Report of SGDClassifier =====
                          precision    recall  f1-score   support

             alt.atheism       0.56      0.42      0.48       319
           comp.graphics       0.69      0.67      0.68       389
 comp.os.ms-windows.misc       0.67      0.60      0.63       394
comp.sys.ibm.pc.hardware       0.65      0.65      0.65       392
   comp.sys.mac.hardware       0.76      0.68      0.72       385
          comp.windows.x       0.74      0.71      0.73       395
            misc.forsale       0.48      0.85      0.61       390
               rec.autos       0.79      0.70      0.74       396
         rec.motorcycles       0.73      0.77      0.75       398
      rec.sport.baseball       0.82      0.78      0.80       397
        rec.sport.hockey       0.82      0.91      0.86       399
               sci.crypt       0.71      0.74      0.73       396
         sci.electronics       0.

In [5]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

# The newsgroups this cell restricts itself to (see the banner printed below).
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

print("===== Due to Memory Constraints, Only Four Categories of News Group are used.")
print("===== categories : "+str(categories)+" ======")

# subset='all' is requested here; the folds below do the train/test splitting.
print("===== Loading Partial Dataset =====")
twenty_partial = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
    shuffle=True,
    random_state=42,
)

print(" ========================= ")
print("===== 10-fold cross validation =====")

print("===== Perform 10-fold Cross Validation for SGDClassifier =====")

def convert_to_np(dataset):
    """Return ``(documents, targets)`` for ``dataset``.

    ``dataset.data`` is converted with ``np.asarray``; ``dataset.target`` is
    returned as-is.
    """
    documents = np.asarray(dataset.data)
    targets = dataset.target
    return documents, targets

x_train, y_train = convert_to_np(twenty_partial)

# Ten folds; every other KFold option is left at its default.
kf = KFold(n_splits=10)

# Running state filled in by the cross-validation loop.
curr_fold = 0
acc_list, folds = [], []
pred_list, true_list = [], []


# Show how many documents each class label has.
from collections import Counter
print(Counter(y_train))

for train_idx, test_idx in kf.split(x_train):
    # A new, unfitted pipeline is built for every fold:
    # token counts -> tf-idf weights -> SGD classifier (hinge loss, L2).
    text_clf_KFold = Pipeline(steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(
            loss='hinge',
            penalty='l2',
            alpha=1e-3,
            random_state=42,
            max_iter=5,
            tol=None,
        )),
    ])

    text_clf_KFold.fit(x_train[train_idx].tolist(), y_train[train_idx])

    predicted__KFold = text_clf_KFold.predict(x_train[test_idx])
    # True labels of the held-out fold, reused by every metric below.
    fold_truth = y_train[test_idx].tolist()

    print("===== Compute for fold_" + str(curr_fold) + " =====")

    acc = accuracy_score(fold_truth, predicted__KFold)
    acc_list.append(acc)
    print("accuracy : " + str(acc))

    prc = precision_score(fold_truth, predicted__KFold, average='weighted')
    print("precision : " + str(prc))

    rec = recall_score(fold_truth, predicted__KFold, average='weighted')
    print("recall : " + str(rec))

    f1 = f1_score(fold_truth, predicted__KFold, average='weighted')
    print("f1_score : " + str(f1))

    # Pool predictions and labels across folds for the overall metrics.
    pred_list.extend(predicted__KFold.tolist())
    true_list.extend(fold_truth)

    curr_fold += 1

print(" ========================= ")

# Accuracy is the mean of the per-fold accuracies; precision, recall and F1
# are computed once over the predictions pooled from all folds.
av_pre = precision_score(true_list, pred_list, average='weighted')
av_rec = recall_score(true_list, pred_list, average='weighted')
av_f1 = f1_score(true_list, pred_list, average='weighted')

print("average accuracy" + " : " + str(np.average(acc_list)))
print("average precision" + " : " + str(av_pre))
print("average recall" + " : " + str(av_rec))
print("average f1 score" + " : " + str(av_f1))

print(" ========================= ")


===== Due to Memory Constraints, Only Four Categories of News Group are used.
===== Loading Partial Dataset =====
===== 10-fold cross validation =====
===== Perform 10-fold Cross Validation for SGDClassifier =====
Counter({3: 997, 2: 990, 1: 973, 0: 799})




===== Compute for fold_0 =====
accuracy : 0.8351063829787234
precision : 0.840069435737129
recall : 0.8351063829787234
f1_score : 0.8312071110412786
===== Compute for fold_1 =====
accuracy : 0.8537234042553191
precision : 0.8601028409159083
recall : 0.8537234042553191
f1_score : 0.8494157764861999
===== Compute for fold_2 =====
accuracy : 0.8404255319148937
precision : 0.8491224467018588
recall : 0.8404255319148937
f1_score : 0.8339858955235389
===== Compute for fold_3 =====
accuracy : 0.8962765957446809
precision : 0.8964394033860396
recall : 0.8962765957446809
f1_score : 0.8941292199797463
===== Compute for fold_4 =====
accuracy : 0.848404255319149
precision : 0.8556365139283808
recall : 0.848404255319149
f1_score : 0.846165254879588
===== Compute for fold_5 =====
accuracy : 0.851063829787234
precision : 0.8517083435232738
recall : 0.851063829787234
f1_score : 0.846085910638123
===== Compute for fold_6 =====
accuracy : 0.8909574468085106
precision : 0.8979289671975622
recall : 0.8909