# Scikit-Learn

# 1. Data

In [9]:
# Load Data
import numpy as np 

def load_data(path = "/Users/charles/MLGT/SS2/Data/"):
    # Get data from each line with label, doc_id, index & tfidf of its vocab
    def sparse_to_dense(sparse_r_d, vocab_size):
        # Init list size vocal size to store vocal 
        r_d = [0.0 for _ in range(vocab_size)]
            
        # Split space & : in context data of each line 
        # Get index (id vocal of each line) & tfidfs
        indices_and_tfidfs = sparse_r_d.split()
        for index_and_tfidf in indices_and_tfidfs:
            index = int(index_and_tfidf.split(':')[0])
            tfidf = float(index_and_tfidf.split(':')[1])
            r_d[index] = tfidf
        return np.array(r_d)    
                
    # Open file (newsgroup, id, context)
    with open(path + "data_tf_idf.txt") as f:
        data_lines = f.read().splitlines()
    # Get size file vocal TF-IDF
    with open(path + "words_idfs.txt") as f:
        vocab_size = len(f.read().splitlines())

    # Member store info of data points: tf_idf, news group, file name of text d
    data, labels = [], []
    # Iterating sequence of pairs with counter
    for data_id, d in enumerate(data_lines):
        features = d.split('<fff>')
        label, doc_id = int(features[0]), int(features[1])
        r_d = sparse_to_dense(sparse_r_d=features[2], vocab_size=vocab_size)

        # Append data & labels
        data.append(r_d)
        labels.append(label)
    return data, np.array(labels)

In [49]:
# Load data
X, y = load_data()

# Split Data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Use csr_matrix to create a sparse matrix with efficient row slicing
# Improve time to computing
from scipy.sparse import csr_matrix
X = csr_matrix(X)

# 2. Accuracy

In [10]:
# Loss Function
def compute_accuracy(predicted_y, expected_y):
    # Check boolean True = 1, False = 0 
    matches = np.equal(predicted_y, expected_y)
    accuracy = np.sum(matches.astype(float)) / len(expected_y)
    return accuracy

# 3. KMeans

In [52]:
from sklearn.cluster import KMeans
# Training to collecting parameter
kmeans = KMeans(random_state=42,             # seed value
                tol = 1e-3,                  # threshold 
                n_init= 5).fit(X_train)         # number of times running with differently init centroid

In [58]:
predicted_y_kmeans = kmeans.predict(X_test)

# Accuracy
accuracy_kmeans = compute_accuracy(predicted_y = predicted_y_kmeans, expected_y = y_test)
accuracy_kmeans

0.03353140916808149

# 4. SVM 

## 4.1. Linear SVMs

In [59]:
# Load data
X, y = load_data()

# Use csr_matrix to create a sparse matrix with efficient row slicing
# Improve time to computing
from scipy.sparse import csr_matrix
X = csr_matrix(X)

# Split Data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [63]:
from sklearn.svm import LinearSVC

classifier = LinearSVC(
    C=10.0,             # penalty coeff
    tol=0.001,          # tolerance for stopping criteria
    verbose=True        # whether prints out logs or not
).fit(X_train, y_train)

predicted_y_svm = classifier.predict(X_test)
accuracy = compute_accuracy(predicted_y=predicted_y_svm, expected_y=y_test)
accuracy

[LibLinear]...........*.*
optimization finished, #iter = 122
Objective value = -476.958797
nSV = 1350
..........................*..*
optimization finished, #iter = 288
Objective value = -928.932852
nSV = 1876
............................*.......***
optimization finished, #iter = 359
Objective value = -1027.136604
nSV = 1830
.........................*
optimization finished, #iter = 253
Objective value = -1255.974334
nSV = 2107
.......................***.
optimization finished, #iter = 240
Objective value = -813.172786
nSV = 1844
............................***
optimization finished, #iter = 285
Objective value = -677.013917
nSV = 1745
.........................*
optimization finished, #iter = 253
Objective value = -856.913884
nSV = 1829
....................*.*
optimization finished, #iter = 215
Objective value = -596.689673
nSV = 1822
................*.**
optimization finished, #iter = 174
Objective value = -384.038738
nSV = 1604
...........*.
optimization finished, #iter = 120
Objective

0.9142614601018676

## 4.2 Kernel SVMs

In [2]:
# Load data
X, y = load_data()

# Use csr_matrix to create a sparse matrix with efficient row slicing
# Improve time to computing
from scipy.sparse import csr_matrix
X = csr_matrix(X)

# Split Data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=79)

In [11]:
from sklearn.svm import SVC

classifier_ker = SVC(
    C=50.0,             # penalty coeff
    kernel="rbf",       
    gamma=0.1,
    tol=0.001,          # tolerance for stopping criteria
    verbose=True        # whether prints out logs or not
).fit(X_train, y_train)

predicted_y_svc = classifier_ker.predict(X_test)
accuracy = compute_accuracy(predicted_y=predicted_y_svc, expected_y=y_test)
accuracy

[LibSVM]*.*
optimization finished, #iter = 1259
obj = -588.837672, rho = 0.325817
nSV = 551, nBSV = 0
*.*
optimization finished, #iter = 1164
obj = -506.301100, rho = 0.229902
nSV = 513, nBSV = 0
*.*
optimization finished, #iter = 1275
obj = -533.746849, rho = 0.271207
nSV = 536, nBSV = 0
*.*
optimization finished, #iter = 1227
obj = -535.910795, rho = 0.266039
nSV = 532, nBSV = 0
*.*
optimization finished, #iter = 1208
obj = -519.954914, rho = 0.244365
nSV = 530, nBSV = 0
*.*
optimization finished, #iter = 1213
obj = -548.006673, rho = 0.363611
nSV = 551, nBSV = 0
*.*
optimization finished, #iter = 1278
obj = -553.751935, rho = 0.279694
nSV = 551, nBSV = 0
*.*
optimization finished, #iter = 1322
obj = -521.020984, rho = 0.184302
nSV = 540, nBSV = 0
*.*
optimization finished, #iter = 1277
obj = -536.057834, rho = 0.239926
nSV = 526, nBSV = 0
*.*
optimization finished, #iter = 1060
obj = -449.264274, rho = 0.094582
nSV = 466, nBSV = 0
*.*
optimization finished, #iter = 1144
obj = -541.2

0.9044991511035654