# TASK 1

In [1]:
# Package installation commands, kept commented out; uncomment to install the dependencies.
# ! pip install -U scikit-learn
# ! pip install pandas

In [2]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [3]:

# Load the training split of the 20 Newsgroups dataset; shuffle=True with
# random_state=42 requests a shuffled but repeatable document order.
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)

In [4]:
# Sanity-check the loaded training split.
# (A bare `twenty_train.target_names` expression used to sit at the top of this
# cell; its value was discarded because only the last expression of a cell is
# displayed, so it has been removed.)
print(len(twenty_train.data))       # number of documents
print(len(twenty_train.filenames))  # number of source files
# First three lines of the first document.
print("\n".join(twenty_train.data[0].split("\n")[:3]))
# Newsgroup name of the first document's label.
print(twenty_train.target_names[twenty_train.target[0]])
# Integer labels of the first ten documents (displayed as the cell result).
twenty_train.target[:10]



11314
11314
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
rec.autos


array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

In [5]:
# Map the first ten integer labels back to their newsgroup names.
label_names = twenty_train.target_names
for label_id in twenty_train.target[:10]:
    print(label_names[label_id])

# Bag-of-words features: learn the vocabulary and build the document-term count matrix.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

rec.autos
comp.sys.mac.hardware
comp.sys.mac.hardware
comp.graphics
sci.space
talk.politics.guns
sci.med
comp.sys.ibm.pc.hardware
comp.os.ms-windows.misc
comp.sys.mac.hardware


(11314, 130107)

In [6]:
# Look up the entry stored for the token 'algorithm' in the fitted vocabulary.
count_vect.vocabulary_.get('algorithm')

27366

In [7]:
# Term-frequency features only: IDF reweighting is switched off via use_idf=False.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(11314, 130107)

In [8]:
# TF-IDF weighting with the transformer's default settings;
# fit_transform fits on the count matrix and transforms it in a single call.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Displayed as the cell result.
X_train_tfidf.shape

(11314, 130107)

In [9]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features.
nb_model = MultinomialNB()
clf = nb_model.fit(X_train_tfidf, twenty_train.target)

In [10]:
# Classify two unseen snippets, reusing the vectorizer and transformer that
# were fitted on the training data (transform only, no refitting).
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

for position, doc in enumerate(docs_new):
    category = predicted[position]
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => rec.autos


In [11]:

# Chain vectorizer -> TF-IDF transformer -> classifier into a single estimator.
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(steps=nb_steps)

In [12]:
# Fit the whole pipeline on the training documents and their labels.
text_clf.fit(twenty_train.data, twenty_train.target)

In [13]:
# Evaluate on the test split: fraction of documents whose label is predicted correctly.
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
(predicted == twenty_test.target).mean()

0.7738980350504514

In [14]:
# Same feature extraction as before, but classified by an SGDClassifier
# configured with hinge loss and an L2 penalty.
sgd_classifier = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', sgd_classifier),
])

text_clf.fit(twenty_train.data, twenty_train.target)

In [15]:
# Test-set accuracy of the SGD pipeline: mean of the element-wise label matches.
predicted = text_clf.predict(docs_test)
(predicted == twenty_test.target).mean()

0.8248805098247477

In [16]:

# Per-class precision / recall / F1 for the current predictions, labelled by newsgroup name.
report_text = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report_text)

                          precision    recall  f1-score   support

             alt.atheism       0.73      0.71      0.72       319
           comp.graphics       0.78      0.72      0.75       389
 comp.os.ms-windows.misc       0.73      0.78      0.75       394
comp.sys.ibm.pc.hardware       0.74      0.67      0.70       392
   comp.sys.mac.hardware       0.81      0.83      0.82       385
          comp.windows.x       0.84      0.76      0.80       395
            misc.forsale       0.84      0.90      0.87       390
               rec.autos       0.91      0.90      0.90       396
         rec.motorcycles       0.93      0.96      0.95       398
      rec.sport.baseball       0.88      0.90      0.89       397
        rec.sport.hockey       0.88      0.99      0.93       399
               sci.crypt       0.84      0.96      0.90       396
         sci.electronics       0.83      0.62      0.71       393
                 sci.med       0.87      0.86      0.87       396
         

In [17]:
# Confusion matrix of the test labels (first argument) against the current predictions.
metrics.confusion_matrix(twenty_test.target, predicted)

array([[226,   0,   0,   1,   0,   2,   1,   0,   1,   3,   0,   2,   1,
         11,   5,  44,   2,   9,   1,  10],
       [  2, 280,  21,   8,   8,  24,   3,   1,   2,   4,   3,   9,   4,
          3,   8,   2,   2,   4,   0,   1],
       [  1,  11, 307,  21,  13,  12,   1,   0,   0,   6,   2,   7,   1,
          1,   7,   2,   0,   1,   0,   1],
       [  3,  10,  27, 264,  26,   3,  12,   4,   4,   2,   1,   4,  22,
          2,   4,   0,   1,   1,   1,   1],
       [  0,   5,   7,  22, 319,   1,   9,   0,   1,   4,   1,   3,   5,
          1,   1,   0,   2,   1,   3,   0],
       [  1,  32,  42,   0,   3, 299,   2,   0,   1,   1,   1,   2,   1,
          1,   7,   1,   1,   0,   0,   0],
       [  0,   2,   1,  13,   4,   0, 350,   7,   1,   1,   2,   1,   3,
          2,   2,   0,   1,   0,   0,   0],
       [  1,   1,   1,   2,   1,   0,  10, 355,   7,   2,   0,   0,  10,
          0,   2,   0,   3,   0,   1,   0],
       [  0,   0,   0,   1,   0,   0,   4,   6, 384,   2,   0,  

In [18]:
# Hyper-parameter grid for the search; each key is <pipeline step>__<parameter>.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 1e-3),
)

In [19]:
# Grid search over `parameters` for the current pipeline, with 5-fold
# cross-validation (cv=5); n_jobs=-1 requests all available CPU cores.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

In [20]:
# Run the search on only the first 400 training documents
# (presumably to keep the search fast; the scores below reflect this small subset).
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

In [21]:
# Classify one snippet with the fitted grid search and map the label id to its newsgroup name.
predicted_label = gs_clf.predict(['God is love'])[0]
twenty_train.target_names[predicted_label]

'soc.religion.christian'

In [22]:
# Best score recorded by the grid search (`best_score_`).
gs_clf.best_score_

0.6275

In [23]:
# Print the selected value of every searched hyper-parameter, in alphabetical key order.
best_params = gs_clf.best_params_
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, best_params[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


# TASK 2

In [24]:
# Multinomial Naive Bayes on TF-IDF features.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(twenty_train.data, twenty_train.target)

# Predict on the test split.
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Precision, recall and F1 are averaged with average='weighted'.
y_true = twenty_test.target
accuracy = accuracy_score(y_true, predicted)
precision = precision_score(y_true, predicted, average='weighted')
recall = recall_score(y_true, predicted, average='weighted')
f1 = f1_score(y_true, predicted, average='weighted')

print(
    f"Accuracy: {accuracy}",
    f"Precision:{precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    sep="\n",
)

Accuracy: 0.7738980350504514
Precision:0.8218781741893993
Recall: 0.7738980350504514
F1 Score: 0.7684457156894653


In [25]:
# SGDClassifier (hinge loss, L2 penalty) on TF-IDF features.
sgd_classifier = SGDClassifier(
    loss='hinge', penalty='l2', alpha=1e-3,
    random_state=42, max_iter=5, tol=None,
)
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', sgd_classifier),
])
text_clf.fit(twenty_train.data, twenty_train.target)

# Predict on the test split.
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Precision, recall and F1 are averaged with average='weighted'.
y_true = twenty_test.target
accuracy = accuracy_score(y_true, predicted)
precision = precision_score(y_true, predicted, average='weighted')
recall = recall_score(y_true, predicted, average='weighted')
f1 = f1_score(y_true, predicted, average='weighted')

print(
    f"Accuracy: {accuracy}",
    f"Precision:{precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    sep="\n",
)

Accuracy: 0.8248805098247477
Precision:0.8280380138303584
Recall: 0.8248805098247477
F1 Score: 0.8190701989313807


In [26]:
# Linear-kernel SVC on TF-IDF features.
from sklearn import svm

linear_svc = svm.SVC(kernel='linear')
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', linear_svc),
])
text_clf.fit(twenty_train.data, twenty_train.target)

# Predict on the test split.
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Precision, recall and F1 are averaged with average='weighted'.
y_true = twenty_test.target
accuracy = accuracy_score(y_true, predicted)
precision = precision_score(y_true, predicted, average='weighted')
recall = recall_score(y_true, predicted, average='weighted')
f1 = f1_score(y_true, predicted, average='weighted')

print(
    f"Accuracy: {accuracy}",
    f"Precision:{precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    sep="\n",
)

Accuracy: 0.8347052575677111
Precision:0.8396041441415458
Recall: 0.8347052575677111
F1 Score: 0.8345147645688246


# TASK 3

In [27]:
# Multinomial Naive Bayes on raw token counts (no TF/TF-IDF step in the pipeline).
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(twenty_train.data, twenty_train.target)

# Predict on the test split.
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Precision, recall and F1 are averaged with average='weighted'.
y_true = twenty_test.target
accuracy = accuracy_score(y_true, predicted)
precision = precision_score(y_true, predicted, average='weighted')
recall = recall_score(y_true, predicted, average='weighted')
f1 = f1_score(y_true, predicted, average='weighted')

print(
    f"Accuracy: {accuracy}",
    f"Precision:{precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    sep="\n",
)

Accuracy: 0.7728359001593202
Precision:0.7616683207318354
Recall: 0.7728359001593202
F1 Score: 0.7511127577441177


In [28]:
# Multinomial Naive Bayes on term-frequency features (TfidfTransformer with use_idf=False).
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', MultinomialNB()),
])
text_clf.fit(twenty_train.data, twenty_train.target)

# Predict on the test split.
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Precision, recall and F1 are averaged with average='weighted'.
y_true = twenty_test.target
accuracy = accuracy_score(y_true, predicted)
precision = precision_score(y_true, predicted, average='weighted')
recall = recall_score(y_true, predicted, average='weighted')
f1 = f1_score(y_true, predicted, average='weighted')

print(
    f"Accuracy: {accuracy}",
    f"Precision:{precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    sep="\n",
)

Accuracy: 0.7052575677110993
Precision:0.78548015927203
Recall: 0.7052575677110993
F1 Score: 0.6920561650276855


In [29]:
# SGDClassifier (hinge loss, L2 penalty) on raw token counts.
sgd_classifier = SGDClassifier(
    loss='hinge', penalty='l2', alpha=1e-3,
    random_state=42, max_iter=5, tol=None,
)
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', sgd_classifier),
])
text_clf.fit(twenty_train.data, twenty_train.target)

# Predict on the test split.
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Precision, recall and F1 are averaged with average='weighted'.
y_true = twenty_test.target
accuracy = accuracy_score(y_true, predicted)
precision = precision_score(y_true, predicted, average='weighted')
recall = recall_score(y_true, predicted, average='weighted')
f1 = f1_score(y_true, predicted, average='weighted')

print(
    f"Accuracy: {accuracy}",
    f"Precision:{precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    sep="\n",
)

Accuracy: 0.7521242697822623
Precision:0.7635144469581859
Recall: 0.7521242697822623
F1 Score: 0.7523939346262882


In [30]:
# SGDClassifier (hinge loss, L2 penalty) on term-frequency features (use_idf=False).
sgd_classifier = SGDClassifier(
    loss='hinge', penalty='l2', alpha=1e-3,
    random_state=42, max_iter=5, tol=None,
)
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', sgd_classifier),
])
text_clf.fit(twenty_train.data, twenty_train.target)

# Predict on the test split.
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Precision, recall and F1 are averaged with average='weighted'.
y_true = twenty_test.target
accuracy = accuracy_score(y_true, predicted)
precision = precision_score(y_true, predicted, average='weighted')
recall = recall_score(y_true, predicted, average='weighted')
f1 = f1_score(y_true, predicted, average='weighted')

print(
    f"Accuracy: {accuracy}",
    f"Precision:{precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    sep="\n",
)

Accuracy: 0.7697822623473181
Precision:0.7785283134394487
Recall: 0.7697822623473181
F1 Score: 0.762996798158262


In [31]:
# Linear-kernel SVC on raw token counts.
from sklearn import svm

linear_svc = svm.SVC(kernel='linear')
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', linear_svc),
])
text_clf.fit(twenty_train.data, twenty_train.target)

# Predict on the test split.
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Precision, recall and F1 are averaged with average='weighted'.
y_true = twenty_test.target
accuracy = accuracy_score(y_true, predicted)
precision = precision_score(y_true, predicted, average='weighted')
recall = recall_score(y_true, predicted, average='weighted')
f1 = f1_score(y_true, predicted, average='weighted')

print(
    f"Accuracy: {accuracy}",
    f"Precision:{precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    sep="\n",
)

Accuracy: 0.7336696760488582
Precision:0.7382348816871909
Recall: 0.7336696760488582
F1 Score: 0.7335972983749119


In [32]:
# Linear-kernel SVC on term-frequency features (use_idf=False).
from sklearn import svm

linear_svc = svm.SVC(kernel='linear')
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', linear_svc),
])
text_clf.fit(twenty_train.data, twenty_train.target)

# Predict on the test split.
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Precision, recall and F1 are averaged with average='weighted'.
y_true = twenty_test.target
accuracy = accuracy_score(y_true, predicted)
precision = precision_score(y_true, predicted, average='weighted')
recall = recall_score(y_true, predicted, average='weighted')
f1 = f1_score(y_true, predicted, average='weighted')

print(
    f"Accuracy: {accuracy}",
    f"Precision:{precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    sep="\n",
)

Accuracy: 0.7591609134360063
Precision:0.7647608383452609
Recall: 0.7591609134360063
F1 Score: 0.75800939744818


Of the nine classifier/feature combinations evaluated above, the SVM classifier with TF-IDF features achieves the best accuracy (0.8347).

# TASK 4

In [43]:
# Effect of CountVectorizer's `lowercase` option on the linear SVC + TF-IDF pipeline.
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
# The test split does not depend on the swept parameter, so it is loaded once
# here instead of being re-fetched on every loop iteration.
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
docs_test = twenty_test.data
svm_classifier = svm.SVC(kernel='linear')

# Define parameter sets: (label used in the results table, values to try)
parameter_sets = [
    ('lowercasing', [True, False])
]
results = []

# Loop through parameter combinations
for param_name, param_values in parameter_sets:
    for param_value in param_values:
        vectorizer = CountVectorizer(lowercase=param_value)

        tfidf_transformer = TfidfTransformer()

        text_clf = Pipeline([
            ('vect', vectorizer),
            ('tfidf', tfidf_transformer),
            ('clf', svm_classifier),
        ])

        # Pipeline.fit refits every step, including the shared classifier instance.
        text_clf.fit(twenty_train.data, twenty_train.target)
        predicted = text_clf.predict(docs_test)

        # Calculate accuracy, recall, precision, and F1 score
        accuracy = accuracy_score(twenty_test.target, predicted)
        recall = recall_score(twenty_test.target, predicted, average='weighted')
        precision = precision_score(twenty_test.target, predicted, average='weighted')
        f1 = f1_score(twenty_test.target, predicted, average='weighted')

        # One row of the results table per parameter value.
        result = {
            'Parameter': f"{param_name}={param_value}",
            'Accuracy': accuracy,
            'Recall': recall,
            'Precision': precision,
            'F1 Score': f1
        }
        results.append(result)

# Create a DataFrame (displayed as the cell result)
df = pd.DataFrame(results)
df

Unnamed: 0,Parameter,Accuracy,Recall,Precision,F1 Score
0,lowercasing=True,0.834705,0.834705,0.839604,0.834515
1,lowercasing=False,0.829129,0.829129,0.835501,0.829469


In [38]:
# Effect of CountVectorizer's `stop_words` option on the linear SVC + TF-IDF pipeline.
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
# The test split does not depend on the swept parameter, so it is loaded once
# here instead of being re-fetched on every loop iteration.
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
docs_test = twenty_test.data
svm_classifier = svm.SVC(kernel='linear')
# (label used in the results table, values to try)
parameter_sets = [
    ('stop_words', [None, 'english'])
]

results = []

# Loop through parameter combinations
for param_name, param_values in parameter_sets:
    for param_value in param_values:
        vectorizer = CountVectorizer(stop_words=param_value)

        tfidf_transformer = TfidfTransformer()

        # Create a pipeline
        text_clf = Pipeline([
            ('vect', vectorizer),
            ('tfidf', tfidf_transformer),
            ('clf', svm_classifier),
        ])

        # Pipeline.fit refits every step, including the shared classifier instance.
        text_clf.fit(twenty_train.data, twenty_train.target)
        predicted = text_clf.predict(docs_test)

        # Calculate accuracy, recall, precision, and F1 score
        accuracy = accuracy_score(twenty_test.target, predicted)
        recall = recall_score(twenty_test.target, predicted, average='weighted')
        precision = precision_score(twenty_test.target, predicted, average='weighted')
        f1 = f1_score(twenty_test.target, predicted, average='weighted')

        # Store results in a dictionary
        result = {
            'Parameter': f"{param_name}={param_value}",
            'Accuracy': accuracy,
            'Recall': recall,
            'Precision': precision,
            'F1 Score': f1
        }
        results.append(result)

# Create a DataFrame (displayed as the cell result)
df = pd.DataFrame(results)
df

Unnamed: 0,Parameter,Accuracy,Recall,Precision,F1 Score
0,stop_words=None,0.834705,0.834705,0.839604,0.834515
1,stop_words=english,0.834971,0.834971,0.840497,0.834988


In [40]:
# analyzer and ngram_range, each varied on its own
# (the other CountVectorizer options stay at their defaults)
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
# The test split does not depend on the swept parameter, so it is loaded once
# here instead of being re-fetched on every loop iteration.
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
docs_test = twenty_test.data

svm_classifier = svm.SVC(kernel='linear')
# Each entry: (CountVectorizer keyword argument name, values to try)
parameter_sets = [
    ('analyzer', ['word', 'char', 'char_wb']),
    ('ngram_range', [(1, 1), (1, 2), (2, 2), (2, 3), (3, 3)])
]

results = []

# Loop through parameter combinations
for param_name, param_values in parameter_sets:
    for param_value in param_values:
        # Pass the swept option by keyword. An if/elif chain would silently reuse
        # the previous iteration's vectorizer for an unrecognised name; here a
        # misspelled parameter name raises a TypeError instead.
        vectorizer = CountVectorizer(**{param_name: param_value})

        tfidf_transformer = TfidfTransformer()

        # Create a pipeline
        text_clf = Pipeline([
            ('vect', vectorizer),
            ('tfidf', tfidf_transformer),
            ('clf', svm_classifier),
        ])

        # Pipeline.fit refits every step, including the shared classifier instance.
        text_clf.fit(twenty_train.data, twenty_train.target)
        predicted = text_clf.predict(docs_test)

        # Calculate accuracy, recall, precision, and F1 score
        accuracy = accuracy_score(twenty_test.target, predicted)
        recall = recall_score(twenty_test.target, predicted, average='weighted')
        precision = precision_score(twenty_test.target, predicted, average='weighted', zero_division='warn')
        f1 = f1_score(twenty_test.target, predicted, average='weighted')

        # Store results in a dictionary
        result = {
            'Parameter': f"{param_name}={param_value}",
            'Accuracy': accuracy,
            'Recall': recall,
            'Precision': precision,
            'F1 Score': f1
        }
        results.append(result)

# Create a DataFrame (displayed as the cell result)
df = pd.DataFrame(results)
df

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Parameter,Accuracy,Recall,Precision,F1 Score
0,analyzer=word,0.834705,0.834705,0.839604,0.834515
1,analyzer=char,0.245751,0.245751,0.254562,0.223204
2,analyzer=char_wb,0.196362,0.196362,0.203998,0.161199
3,"ngram_range=(1, 1)",0.834705,0.834705,0.839604,0.834515
4,"ngram_range=(1, 2)",0.838555,0.838555,0.845291,0.83878
5,"ngram_range=(2, 2)",0.771907,0.771907,0.789605,0.77506
6,"ngram_range=(2, 3)",0.759161,0.759161,0.780599,0.762704
7,"ngram_range=(3, 3)",0.678837,0.678837,0.729252,0.690051


In [41]:
# analyzer x ngram_range: every combination of the two options
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
# The test split does not depend on the swept parameters, so it is loaded once
# here instead of being re-fetched on every loop iteration.
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
docs_test = twenty_test.data

svm_classifier = svm.SVC(kernel='linear')
parameter_sets = [
    ('analyzer', ['word', 'char', 'char_wb']),
    ('ngram_range', [(1, 1), (1, 2), (2, 2), (2, 3), (3, 3)])
]
# Single source of truth for the grid: the loops read their values from
# `parameter_sets` rather than repeating the lists.
param_grid = dict(parameter_sets)

results = []

# Loop through parameter combinations
for analyzer_value in param_grid['analyzer']:
    for ngram_value in param_grid['ngram_range']:
        vectorizer = CountVectorizer(analyzer=analyzer_value, ngram_range=ngram_value)

        tfidf_transformer = TfidfTransformer()

        # Create a pipeline
        text_clf = Pipeline([
            ('vect', vectorizer),
            ('tfidf', tfidf_transformer),
            ('clf', svm_classifier),
        ])

        # Pipeline.fit refits every step, including the shared classifier instance.
        text_clf.fit(twenty_train.data, twenty_train.target)
        predicted = text_clf.predict(docs_test)

        # Calculate accuracy, recall, precision, and F1 score
        accuracy = accuracy_score(twenty_test.target, predicted)
        recall = recall_score(twenty_test.target, predicted, average='weighted')
        precision = precision_score(twenty_test.target, predicted, average='weighted', zero_division='warn')
        f1 = f1_score(twenty_test.target, predicted, average='weighted')

        # Store results in a dictionary
        result = {
            'Parameter': f"analyzer={analyzer_value}, ngram_range={ngram_value}",
            'Accuracy': accuracy,
            'Recall': recall,
            'Precision': precision,
            'F1 Score': f1
        }
        results.append(result)

# Create a DataFrame (displayed as the cell result)
df = pd.DataFrame(results)
df


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Parameter,Accuracy,Recall,Precision,F1 Score
0,"analyzer=word, ngram_range=(1, 1)",0.834705,0.834705,0.839604,0.834515
1,"analyzer=word, ngram_range=(1, 2)",0.838555,0.838555,0.845291,0.83878
2,"analyzer=word, ngram_range=(2, 2)",0.771907,0.771907,0.789605,0.77506
3,"analyzer=word, ngram_range=(2, 3)",0.759161,0.759161,0.780599,0.762704
4,"analyzer=word, ngram_range=(3, 3)",0.678837,0.678837,0.729252,0.690051
5,"analyzer=char, ngram_range=(1, 1)",0.245751,0.245751,0.254562,0.223204
6,"analyzer=char, ngram_range=(1, 2)",0.483935,0.483935,0.518863,0.477514
7,"analyzer=char, ngram_range=(2, 2)",0.617764,0.617764,0.630332,0.617491
8,"analyzer=char, ngram_range=(2, 3)",0.723845,0.723845,0.741471,0.726485
9,"analyzer=char, ngram_range=(3, 3)",0.739246,0.739246,0.757595,0.742628


In [42]:
# Effect of CountVectorizer's `max_features` cap on the linear SVC + TF-IDF pipeline.
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
# The test split does not depend on the swept parameter, so it is loaded once
# here instead of being re-fetched on every loop iteration.
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
docs_test = twenty_test.data

svm_classifier = svm.SVC(kernel='linear')
# NOTE: the default (unigram) vocabulary fitted earlier in this notebook has
# 130107 terms, so the two larger caps exceed the vocabulary size; their
# result rows are identical.
max_features_values = [10000, 200000, 5000000]

results = []

# Loop through max_features values
for max_features_value in max_features_values:
    vectorizer = CountVectorizer(max_features=max_features_value)

    tfidf_transformer = TfidfTransformer()

    # Create a pipeline
    text_clf = Pipeline([
        ('vect', vectorizer),
        ('tfidf', tfidf_transformer),
        ('clf', svm_classifier),
    ])

    # Pipeline.fit refits every step, including the shared classifier instance.
    text_clf.fit(twenty_train.data, twenty_train.target)
    predicted = text_clf.predict(docs_test)

    # Calculate accuracy, recall, precision, and F1 score
    accuracy = accuracy_score(twenty_test.target, predicted)
    recall = recall_score(twenty_test.target, predicted, average='weighted')
    precision = precision_score(twenty_test.target, predicted, average='weighted', zero_division='warn')
    f1 = f1_score(twenty_test.target, predicted, average='weighted')

    # Store results in a dictionary
    result = {
        'Parameter': f"max_features={max_features_value}",
        'Accuracy': accuracy,
        'Recall': recall,
        'Precision': precision,
        'F1 Score': f1
    }
    results.append(result)

# Create a DataFrame (displayed as the cell result)
df = pd.DataFrame(results)
df


Unnamed: 0,Parameter,Accuracy,Recall,Precision,F1 Score
0,max_features=10000,0.812268,0.812268,0.817907,0.812612
1,max_features=200000,0.834705,0.834705,0.839604,0.834515
2,max_features=5000000,0.834705,0.834705,0.839604,0.834515


# Best parameters

In [49]:
from sklearn.svm import SVC

# Final model: linear SVC + TF-IDF using the CountVectorizer settings chosen
# from the individual sweeps above.
best_analyzer = 'word'
best_ngram_range = (1, 2)
best_stop_words = None  # Conflict when set to 'english'
best_lowercasing = True
# NOTE(review): max_features=200000 was only evaluated with unigrams, where it
# matched the uncapped result. Combined with ngram_range=(1, 2) it may actually
# truncate the vocabulary: the accuracy printed below (0.8335) is lower than the
# 0.8386 measured for ngram_range=(1, 2) without a cap. Confirm whether the cap
# should be raised or removed.
best_max_features = 200000

# Create a pipeline with the best parameters
vectorizer = CountVectorizer(
    analyzer=best_analyzer,
    ngram_range=best_ngram_range,
    stop_words=best_stop_words,
    lowercase=best_lowercasing,
    max_features=best_max_features
)

tfidf_transformer = TfidfTransformer()
# Built once, from the SVC imported at the top of this cell. The earlier
# duplicate `svm.SVC(...)` construction was immediately overwritten and relied
# on `svm` being imported by another cell.
svm_classifier = SVC(kernel='linear')

text_clf = Pipeline([
    ('vect', vectorizer),
    ('tfidf', tfidf_transformer),
    ('clf', svm_classifier),
])

# Load data
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)

text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(twenty_test.data)

# Evaluate the model (precision/recall/F1 use average='weighted')
accuracy = accuracy_score(twenty_test.target, predicted)
recall = recall_score(twenty_test.target, predicted, average='weighted')
precision = precision_score(twenty_test.target, predicted, average='weighted', zero_division='warn')
f1 = f1_score(twenty_test.target, predicted, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1 Score: {f1:.4f}")


Accuracy: 0.8335
Recall: 0.8335
Precision: 0.8399
F1 Score: 0.8337
