# Load the dataset

In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Restrict the corpus to two newsgroups, which makes this a two-class problem.
categories = ['comp.graphics', 'sci.space']

# Both splits are fetched with identical settings; only the `subset` name
# differs, so the call is written once and unpacked into train/test.
data_train, data_test = (
    fetch_20newsgroups(subset=subset_name,
                       categories=categories,
                       random_state=42)
    for subset_name in ('train', 'test')
)

In [3]:
import nltk
# Fetch the NLTK data packages used by the preprocessing cell. When a package
# is already installed the call just logs "already up-to-date" (see output).
nltk.download("names")    # name list read later through nltk.corpus.names
nltk.download("wordnet")  # presumably the data backing WordNetLemmatizer -- confirm

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\KIST\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KIST\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Data Preprocessing

In [4]:
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer

# Every entry of the NLTK names corpus, held in a set so the per-token
# membership test inside clean_text is a hash lookup.
all_names = set(names.words())
# Single lemmatizer instance shared by every clean_text call.
lemmatizer = WordNetLemmatizer()

def clean_text(docs):
    """Normalise raw documents into space-joined, lemmatized tokens.

    Each document is split on whitespace. A token is kept only if it is
    purely alphabetic and is not found in the module-level ``all_names``
    set (the check is made on the token's original casing). Kept tokens
    are lower-cased, passed through the module-level ``lemmatizer`` and
    joined back together with single spaces.

    Parameters
    ----------
    docs : iterable of str
        Raw documents.

    Returns
    -------
    list of str
        One cleaned string per input document, in the same order. A
        document with no surviving tokens becomes the empty string.
    """
    def _clean_one(text):
        # Filter first, on the original token, then normalise what is left.
        kept_tokens = (
            token for token in text.split()
            if token.isalpha() and token not in all_names
        )
        return ' '.join(lemmatizer.lemmatize(token.lower()) for token in kept_tokens)

    return [_clean_one(text) for text in docs]

In [5]:
# Run both splits through the same cleaning step.
cleaned_train, cleaned_test = [
    clean_text(split.data) for split in (data_train, data_test)
]
# Labels are taken unchanged from the fetched splits.
label_train, label_test = data_train.target, data_test.target

# Number of labelled documents in each split.
len(label_train), len(label_test)

(1177, 783)

In [6]:
from collections import Counter
# Class balance of the training labels (label -> number of documents).
Counter(label_train)

Counter({0: 584, 1: 593})

In [7]:
# Class balance of the test labels (label -> number of documents).
Counter(label_test)

Counter({1: 394, 0: 389})

# Feature Extraction

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features using scikit-learn's built-in English stop-word list, with
# the vocabulary capped at 8000 terms.
tfidf_vectorizer = TfidfVectorizer(stop_words='english',max_features= 8000)

# The vectorizer is fitted on the training documents only; the test documents
# are transformed with that fitted vectorizer, so both matrices share the
# same feature columns.
term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train)
term_docs_test = tfidf_vectorizer.transform(cleaned_test)

In [9]:
# Sanity check: (number of training documents, number of features).
term_docs_train.shape

(1177, 8000)

In [10]:
# Sanity check: (number of test documents, number of features).
term_docs_test.shape

(783, 8000)

# SVM Training and Evaluation

In [11]:
from sklearn.svm import SVC
# Linear-kernel support vector classifier; C=1.0 is the default value, which
# is why it does not appear in the estimator repr shown below.
# NOTE(review): scikit-learn documents SVC's random_state as affecting only
# the probability estimates, which are not enabled here -- confirm it is needed.
svm = SVC(kernel="linear", C = 1.0, random_state=42)
# Train on the TF-IDF matrix of the training split; as the cell's last
# expression, the fitted estimator is also displayed.
svm.fit(term_docs_train, label_train)

SVC(kernel='linear', random_state=42)

In [12]:
# Score the fitted classifier on the test split (for classifiers,
# scikit-learn's score() is mean accuracy).
accuracy = svm.score(term_docs_test, label_test)
# Report it as a percentage with one decimal place.
print("acc on test set: {0:.1f}%".format(accuracy*100))

acc on test set: 95.9%
