In [1]:
import warnings
from sklearn.datasets import fetch_20newsgroups

# Silence every warning for the remainder of the kernel session.
warnings.filterwarnings('ignore')

# Load the 20 Newsgroups dataset using the loader's default arguments.
groups = fetch_20newsgroups()

In [2]:
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer

# Materialise the NLTK names word list as a set so the per-token membership
# test in clean_text is a hash lookup rather than a list scan.
all_names = set(names.words())
# One shared lemmatizer instance, reused for every token in clean_text.
lemmatizer = WordNetLemmatizer()

def clean_text(docs, lemmatize=None, excluded_words=None):
    """Reduce raw documents to space-joined strings of lemmatised tokens.

    Each document is split on whitespace. A token is kept only when it is
    purely alphabetic (``str.isalpha``) and is not found in
    ``excluded_words``; that membership test is applied to the token in its
    original case, before lower-casing. Kept tokens are lower-cased and then
    passed through ``lemmatize``.

    Args:
        docs: Iterable of document strings.
        lemmatize: Optional callable mapping a lower-cased token to its
            lemma. Defaults to the module-level ``lemmatizer.lemmatize``.
        excluded_words: Optional container of tokens to drop. Defaults to
            the module-level ``all_names`` set.

    Returns:
        A list with one cleaned string per input document, in input order.
        A document with no surviving tokens yields an empty string.
    """
    # Fall back to the notebook-level objects so existing calls of the form
    # clean_text(docs) behave exactly as before.
    if lemmatize is None:
        lemmatize = lemmatizer.lemmatize
    if excluded_words is None:
        excluded_words = all_names

    cleaned_docs = []
    for doc in docs:
        tokens = [lemmatize(word.lower()) for word in doc.split()
                  if word.isalpha() and word not in excluded_words]
        # append avoids building a throwaway one-element list per document.
        cleaned_docs.append(' '.join(tokens))

    return cleaned_docs

# Clean every raw post; clean_text returns one string per document in input
# order, so cleaned_data stays index-aligned with groups.target.
cleaned_data = clean_text(groups.data)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned documents (and their labels) for evaluation.
# The fixed random_state keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_data,
    groups.target,
    test_size=0.2,
    random_state=0,
)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups

# NOTE: `groups` was already loaded in the first cell and is not used in this
# cell, so the dataset is not fetched a second time here.

# Cap the vocabulary at 1000 features and drop English stop words.
tfidfv = TfidfVectorizer(max_features=1000, stop_words='english')

# Fit on the training split only and vectorise it in the same pass;
# fit_transform yields the same matrix as fit() followed by transform()
# without tokenising the training documents twice.
X_train_vec = tfidfv.fit_transform(X_train)
# fit() returns the vectorizer itself, so the fitted model is this same
# object; the name is kept for any cell that refers to tfidfv_model.
tfidfv_model = tfidfv

# The test split is only transformed with the vocabulary and weights learned
# from the training split.
X_test_vec = tfidfv_model.transform(X_test)

In [5]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Baseline model: linear-kernel SVM with C=0.1, trained on the TF-IDF
# representation of the training split.
svm = SVC(kernel='linear', C=0.1, random_state=0)

svm.fit(X_train_vec, y_train)

# Predict on the held-out split, which was vectorised with the vectorizer
# fitted on X_train.
prediction = svm.predict(X_test_vec)

# Per-class precision / recall / F1 for the baseline.
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.63      0.51      0.57        88
           1       0.59      0.63      0.61       128
           2       0.65      0.64      0.64       113
           3       0.58      0.53      0.55       128
           4       0.60      0.51      0.55       120
           5       0.72      0.59      0.65       120
           6       0.67      0.67      0.67       105
           7       0.77      0.63      0.69       117
           8       0.88      0.70      0.78       128
           9       0.74      0.84      0.79       119
          10       0.96      0.77      0.85       115
          11       0.97      0.68      0.80       130
          12       0.32      0.82      0.47       122
          13       0.59      0.80      0.68        99
          14       0.86      0.72      0.79       122
          15       0.63      0.83      0.71       127
          16       0.86      0.70      0.77       115
          17       0.96    

In [6]:
from sklearn.model_selection import GridSearchCV

# Try two values of the SVM penalty C, scoring each with 3-fold
# cross-validation on the vectorised training split.
parameters = {'C': (0.1, 1)}
grid_search = GridSearchCV(estimator=svm, param_grid=parameters, cv=3)
grid_search.fit(X_train_vec, y_train)

# Show the winning setting followed by its cross-validation score.
print(grid_search.best_params_, grid_search.best_score_, sep='\n')

{'C': 1}
0.7068832173240525


In [7]:
# Take the estimator selected by the grid search over C and evaluate it on
# the held-out, already-vectorised test split.
svm_best = grid_search.best_estimator_
prediction = svm_best.predict(X_test_vec)  # replaces the baseline predictions

print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.65      0.74      0.69        88
           1       0.63      0.66      0.64       128
           2       0.65      0.74      0.69       113
           3       0.60      0.59      0.60       128
           4       0.58      0.57      0.57       120
           5       0.75      0.69      0.72       120
           6       0.75      0.72      0.73       105
           7       0.69      0.74      0.71       117
           8       0.84      0.77      0.80       128
           9       0.71      0.84      0.77       119
          10       0.90      0.82      0.86       115
          11       0.92      0.85      0.88       130
          12       0.61      0.71      0.66       122
          13       0.74      0.78      0.76        99
          14       0.79      0.81      0.80       122
          15       0.76      0.81      0.79       127
          16       0.86      0.78      0.82       115
          17       0.91    

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# Chain vectorisation and classification so the TF-IDF settings can be tuned
# together with the SVM penalty.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('svc', SVC(kernel='linear')),
])

# Keys are '<step name>__<parameter>' for the steps declared above.
parameters_pipeline = {
    'tfidf__max_df': (0.25, 0.5),
    'tfidf__max_features': (100, 1000),
    'svc__C': (0.1, 1),
}

# The pipeline is fitted on the cleaned text (X_train), not on a
# pre-vectorised matrix. This rebinds grid_search, replacing the earlier
# SVC-only search object.
grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters_pipeline, cv=3)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [9]:
# Best parameter combination found by the pipeline grid search, then its
# cross-validation score.
print(grid_search.best_params_)
print(grid_search.best_score_)

{'svc__C': 1, 'tfidf__max_df': 0.5, 'tfidf__max_features': 1000}
0.7076566125290022


In [10]:
# Evaluate the estimator selected by the pipeline grid search. It was fitted
# on cleaned text, so it is given X_test directly rather than X_test_vec.
svm_best = grid_search.best_estimator_
prediction = svm_best.predict(X_test)  # replaces the previous predictions

print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.65      0.74      0.69        88
           1       0.63      0.66      0.64       128
           2       0.65      0.74      0.69       113
           3       0.60      0.59      0.60       128
           4       0.58      0.57      0.57       120
           5       0.75      0.69      0.72       120
           6       0.75      0.72      0.73       105
           7       0.69      0.74      0.71       117
           8       0.84      0.77      0.80       128
           9       0.71      0.84      0.77       119
          10       0.90      0.82      0.86       115
          11       0.92      0.85      0.88       130
          12       0.61      0.71      0.66       122
          13       0.74      0.78      0.76        99
          14       0.79      0.81      0.80       122
          15       0.76      0.81      0.79       127
          16       0.86      0.78      0.82       115
          17       0.91    