# Classifying NewsGroup topics with Support Vector Machines (NLP)

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer

In [2]:
# Bare reference to the dataset loader: as the last expression of the cell it
# displays the function's signature. The loader itself is not called here.
fetch_20newsgroups

<function sklearn.datasets._twenty_newsgroups.fetch_20newsgroups(*, data_home=None, subset='train', categories=None, shuffle=True, random_state=42, remove=(), download_if_missing=True, return_X_y=False)>

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns

# List the names of the example datasets that seaborn exposes.
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [5]:
from sklearn.datasets import load_iris
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

In [11]:
# Binary task: detect Iris virginica (target == 2) from the two petal
# measurements only.
iris = load_iris(as_frame=True)
X = iris.data[["petal length (cm)", "petal width (cm)"]].to_numpy()
y = iris.target == 2  # boolean labels: True for Iris virginica

# Scale the features, then fit a linear soft-margin SVM classifier.
svm_clf = make_pipeline(
    StandardScaler(),
    LinearSVC(C=1, random_state=40),
)

svm_clf.fit(X, y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearsvc', LinearSVC(C=1, random_state=40))])

In [10]:
# Two unseen samples, each given as [petal length (cm), petal width (cm)].
X_new = [
    [5.5, 1.7],
    [3.0, 1.0],
]
svm_clf.predict(X_new)

array([ True, False])

In [12]:
# Raw decision-function scores for the same samples; in the recorded outputs
# the positive score pairs with the True prediction and the negative with False.
svm_clf.decision_function(X_new)

array([ 0.66163641, -3.07415868])

In [16]:
# Nonlinear SVM classification: many datasets are not linearly separable.
# Here the inputs are expanded with degree-2 polynomial features before a
# linear SVM is fitted.

from sklearn.datasets import make_moons
from sklearn.preprocessing import PolynomialFeatures

# Small two-class "moons" toy dataset with a fixed seed.
X, y = make_moons(n_samples=90, noise=0.10, random_state=40)

polynomial_svm_clf = make_pipeline(
    PolynomialFeatures(degree=2),
    StandardScaler(),
    LinearSVC(C=10, max_iter=10_000, random_state=40),
)

polynomial_svm_clf.fit(X, y)

Pipeline(steps=[('polynomialfeatures', PolynomialFeatures()),
                ('standardscaler', StandardScaler()),
                ('linearsvc',
                 LinearSVC(C=10, max_iter=10000, random_state=40))])

In [18]:
# Polynomial kernel: SVC is given a degree-2 polynomial kernel, so the
# pipeline needs no explicit PolynomialFeatures step.

from sklearn.svm import SVC

poly_kernel_svm_clf = make_pipeline(
    StandardScaler(),
    SVC(kernel="poly", degree=2, coef0=1, C=3),
)

poly_kernel_svm_clf.fit(X, y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(C=3, coef0=1, degree=2, kernel='poly'))])

In [20]:


# RBF-kernel SVM classifier, fitted on the X, y currently in the kernel
# (the moons data when the cells are run top to bottom).
rbf_kernel_svm_clf = make_pipeline(
    StandardScaler(),
    SVC(kernel="rbf", gamma=7, C=0.009),
)

rbf_kernel_svm_clf.fit(X, y)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(C=0.009, gamma=7))])

In [23]:
# SVM regression: fit a linear SVR to synthetic data.

from sklearn.svm import LinearSVR

np.random.seed(40)  # seed NumPy's global RNG so the synthetic data is repeatable

# 50 samples with 8 uniform features in [0, 2); only the first feature drives
# the target: y = 4 + 3 * x0 + Gaussian noise.
X = 2 * np.random.rand(50, 8)
y = 4 + 3 * X[:, 0] + np.random.randn(50)

svm_reg = make_pipeline(
    StandardScaler(),
    LinearSVR(epsilon=0.5, random_state=40),
)

svm_reg.fit(X, y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearsvr', LinearSVR(epsilon=0.5, random_state=40))])

In [24]:
# Reducing epsilon increases the number of support vectors, which regularizes
# the model.
from sklearn.svm import SVR

# Kernelized SVR with a degree-5 polynomial kernel on the current X, y.
svm_poly_reg = make_pipeline(
    StandardScaler(),
    SVR(kernel="poly", degree=5, epsilon=0.1, C=0.001),
)

svm_poly_reg.fit(X, y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svr', SVR(C=0.001, degree=5, kernel='poly'))])