In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import LinearSVC
from scipy.sparse import hstack
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings

In [3]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Path to the data file in your Google Drive
data_path = "/content/drive/MyDrive/TAMU Sem 2/ISR/Clean DataSet/df_eda.pkl"

# Read the data file
df = pd.read_pickle(data_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
df.head()

Unnamed: 0,Title,Body,Tags
1,good branching merge tutorial tortoisesvn,really good tutorial explain branch merge apac...,[svn]
2,asp net site map,anyone get experience create sqlbased asp net ...,"[sql, asp.net]"
3,function create color wheel,something pseudosolved many time never quite f...,[algorithm]
4,add script functionality net application,little game write c us database backend tradin...,"[c#, .net]"
5,use nested class case,work collection class use video playback recor...,"[c++, oop]"


In [5]:
#Combining Title and body
df['Data'] = df['Title'] + ". " + df['Body']

In [6]:
df.head()

Unnamed: 0,Title,Body,Tags,Data
1,good branching merge tutorial tortoisesvn,really good tutorial explain branch merge apac...,[svn],good branching merge tutorial tortoisesvn. rea...
2,asp net site map,anyone get experience create sqlbased asp net ...,"[sql, asp.net]",asp net site map. anyone get experience create...
3,function create color wheel,something pseudosolved many time never quite f...,[algorithm],function create color wheel. something pseudos...
4,add script functionality net application,little game write c us database backend tradin...,"[c#, .net]",add script functionality net application. litt...
5,use nested class case,work collection class use video playback recor...,"[c++, oop]",use nested class case. work collection class u...


In [7]:
#TFIDF
vectorizer = TfidfVectorizer(analyzer='word',
                              min_df=0.0,
                              max_df=1.0,
                              strip_accents=None,
                              encoding='utf-8',
                              preprocessor=None,
                              token_pattern=r"(?u)\S\S+",
                              max_features=1000)

tfidf_vec = vectorizer.fit_transform(df['Data'])

In [8]:
tfidf_vec

<80393x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 2636660 stored elements in Compressed Sparse Row format>

In [9]:
#Tags
target_values = df['Tags']
mlb = MultiLabelBinarizer()
target_binary_labels = mlb.fit_transform(target_values)
target_binary_classes = mlb.classes_

In [10]:
target_binary_labels

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [11]:
target_binary_classes

array(['.net', 'ajax', 'algorithm', 'android', 'angularjs', 'api',
       'arrays', 'asp.net', 'asp.net-mvc', 'asp.net-mvc-3', 'bash', 'c',
       'c#', 'c++', 'c++11', 'cocoa', 'cocoa-touch', 'css', 'css3',
       'database', 'datetime', 'debugging', 'delphi', 'django', 'eclipse',
       'emacs', 'entity-framework', 'exception', 'facebook', 'function',
       'gcc', 'generics', 'git', 'google-chrome', 'haskell', 'hibernate',
       'html', 'html5', 'http', 'image', 'ios', 'ipad', 'iphone', 'java',
       'javascript', 'jquery', 'json', 'linq', 'linux', 'list', 'math',
       'matlab', 'maven', 'mongodb', 'multithreading', 'mysql', 'node.js',
       'numpy', 'objective-c', 'oop', 'optimization', 'oracle', 'osx',
       'performance', 'perl', 'php', 'postgresql', 'python', 'qt', 'r',
       'regex', 'rest', 'ruby', 'ruby-on-rails', 'ruby-on-rails-3',
       'scala', 'security', 'shell', 'spring', 'sql', 'sql-server',
       'sql-server-2008', 'string', 'svn', 'swift', 'swing', 'template

In [12]:
#Train test split
X_train, X_test, y_train, y_test = train_test_split(tfidf_vec, target_binary_labels, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)


In [13]:
#Training the model
classifier = CalibratedClassifierCV(LinearSVC(verbose=0))
clf = MultiOutputClassifier(classifier)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_score = clf.predict_proba(X_test)

In [24]:
from sklearn.metrics import hamming_loss, jaccard_score, precision_recall_fscore_support as score
from sklearn.metrics import precision_score, recall_score, f1_score
import warnings

warnings.simplefilter("ignore")

# Calculate evaluation metrics
precision_avg = precision_score(y_test, y_pred, average='micro')
recall_avg = recall_score(y_test, y_pred, average='micro')
f1_avg = f1_score(y_test, y_pred, average='micro')
hamming_avg = hamming_loss(y_test, y_pred)
jaccard_avg = jaccard_score(y_test, y_pred, average='micro')

print("Classifier: LinearSVC")
print("avg precision: {}".format(precision_avg))
print("avg recall: {}".format(recall_avg))
print("avg f1-score: {}".format(f1_avg))
print("avg hamming loss: {}".format(hamming_avg))
print("avg jaccard score: {}".format(jaccard_avg))

# Calculate per-tag metrics
hamming_per_tag = []
jaccard_per_tag = []
precision_per_tag, recall_per_tag, fscore_per_tag, support_per_tag = score(y_test, y_pred)
for i, (test, pred) in enumerate(zip(y_test.T, y_pred.T)):
    hamming_per_tag.append(hamming_loss(test, pred))
    jaccard_per_tag.append(jaccard_score(test, pred))

metrics_per_tag = pd.DataFrame(data=[precision_per_tag, recall_per_tag, fscore_per_tag, support_per_tag, hamming_per_tag, jaccard_per_tag],
                               index=["Precision", "Recall", "F-1 score", "True count", "Hamming loss", "Jaccard score"],
                               columns=target_binary_classes)

print(metrics_per_tag)


Classifier: LinearSVC
avg precision: 0.8014119790480528
avg recall: 0.41618702199795
avg f1-score: 0.5478606087340098
avg hamming loss: 0.01083711673611543
avg jaccard score: 0.3772782503037667
                     .net        ajax   algorithm      android   angularjs  \
Precision        0.695035    0.750000    0.763780     0.962822    0.900000   
Recall           0.142029    0.365854    0.451163     0.778720    0.527027   
F-1 score        0.235860    0.491803    0.567251     0.861040    0.664773   
True count     690.000000  123.000000  215.000000  1297.000000  222.000000   
Hamming loss     0.039493    0.005784    0.009205     0.020275    0.007339   
Jaccard score    0.133697    0.326087    0.395918     0.755988    0.497872   

                     api      arrays     asp.net  asp.net-mvc  asp.net-mvc-3  \
Precision       0.600000    0.580952    0.748344     0.786667       0.333333   
Recall          0.042857    0.297561    0.330409     0.475806       0.014085   
F-1 score       0.0

NameError: name 'plot_micro_average_roc' is not defined

In [17]:
y_pred

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])