# Skill Classification Using SVM and Random Forest.

# 1) Data Preprocessing

Importing Libraries for data analysis


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
#load the dataset using Pandas
df = pd.read_csv('data_new.csv')

In [5]:
#Head of the dataset
df.head()

Unnamed: 0,Skills\t,label
0,Python Django\tSQLite\tMysql\tmongoDB\t,Backend Developer
1,HTML\tJava Script\tCSS\tMysql\tmongoDB,Frontend Developer
2,Python\tDjango\tFlask\tSQLite\tMysql,Backend Developer
3,Java\tSpring\tHybernet\tSQL Server\tMysql,Backend Developer
4,Python\tFlask\tSQLite\tMysql\tmongoDB\t,Backend Developer


In [6]:
# Check for missing values
df.isna().sum()

Skills\t    0
label       0
dtype: int64

In [7]:
#Tail of the dataset
df.tail()

Unnamed: 0,Skills\t,label
259,HTML\tJava Script\tCSS\tMysql\tmongoDB,Frontend Developer
260,HTML\tJava Script\tCSS\tMysql\tmongoDB,Frontend Developer
261,HTML\tJava Script\tCSS\tMysql\tmongoDB,Frontend Developer
262,HTML\tJava Script\tCSS\tMysql\tmongoDB,Frontend Developer
263,HTML\tJava Script\tCSS\tMysql\tmongoDB,Frontend Developer


In [8]:
#Describing the statistical summery
df.describe()

Unnamed: 0,Skills\t,label
count,264,264
unique,27,4
top,C programming\tSQLite\tMongo DB\tMysql\tCpp,Backend Developer
freq,65,66


In [9]:
#Taking the count of labels
df['label'].value_counts()/ (len(df))

Backend Developer     0.25
Frontend Developer    0.25
Data Scientist        0.25
Embedded              0.25
Name: label, dtype: float64

In [10]:
df['label'].value_counts()

Backend Developer     66
Frontend Developer    66
Data Scientist        66
Embedded              66
Name: label, dtype: int64

In [11]:
# Divide the dataset into training and tesing
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test =  train_test_split(df['Skills\t'], df['label'], test_size = 0.3, random_state =0, shuffle = True)

In [13]:
X_train.shape

(184,)

In [14]:
X_test.shape

(80,)

In [15]:
# CountVectorizer:It is used to transform a given text into a vector on the basis of the frequency (count) of each word that occurs in the entire text.
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer().fit(X_train)

# 2) Building the Model (Random Forest)

Random Forest Model:  "Random Forest is a classifier that contains a number of decision trees on various subsets of the given dataset and takes the average to improve the predictive accuracy of that dataset."

TfidfVectorizer: Term frequency-inverse document frequency is a text vectorizer that transforms the text into a usable vector. It combines 2 concepts, Term Frequency (TF) and Document Frequency (DF).

Pipeline:Scikit-learn's pipeline class is a useful tool for encapsulating multiple different transformers alongside an estimator into one object, so that you only have to call your important methods once ( fit() , predict() , etc).

In [16]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline

In [17]:
classifier = Pipeline([("tfidf", TfidfVectorizer()) , ("classifier", RandomForestClassifier(n_estimators=100))])

In [18]:
classifier.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('classifier', RandomForestClassifier())])

# 3) Predicting the results (Random Forest)

In [19]:
y_pred = classifier.predict(X_test)

In [20]:
y_test, y_pred

(136    Frontend Developer
 101              Embedded
 240              Embedded
 8       Backend Developer
 181        Data Scientist
               ...        
 89               Embedded
 108              Embedded
 238              Embedded
 255    Frontend Developer
 223     Backend Developer
 Name: label, Length: 80, dtype: object,
 array(['Frontend Developer', 'Embedded', 'Embedded', 'Backend Developer',
        'Data Scientist', 'Data Scientist', 'Backend Developer',
        'Embedded', 'Backend Developer', 'Frontend Developer', 'Embedded',
        'Frontend Developer', 'Embedded', 'Data Scientist',
        'Data Scientist', 'Data Scientist', 'Backend Developer',
        'Embedded', 'Frontend Developer', 'Data Scientist',
        'Backend Developer', 'Frontend Developer', 'Backend Developer',
        'Backend Developer', 'Frontend Developer', 'Frontend Developer',
        'Data Scientist', 'Data Scientist', 'Embedded', 'Embedded',
        'Data Scientist', 'Frontend Developer', '

In [21]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [22]:
accuracy_score(y_test, y_pred)

1.0

In [23]:
confusion_matrix(y_test, y_pred)

array([[17,  0,  0,  0],
       [ 0, 24,  0,  0],
       [ 0,  0, 23,  0],
       [ 0,  0,  0, 16]])

In [24]:
print(classification_report(y_test, y_pred))

                    precision    recall  f1-score   support

 Backend Developer       1.00      1.00      1.00        17
    Data Scientist       1.00      1.00      1.00        24
          Embedded       1.00      1.00      1.00        23
Frontend Developer       1.00      1.00      1.00        16

          accuracy                           1.00        80
         macro avg       1.00      1.00      1.00        80
      weighted avg       1.00      1.00      1.00        80



SVM: Support Vector Machine(SVM) is a supervised machine learning algorithm used for both classification and regression. Though we say regression problems as well its best suited for classification. The objective of SVM algorithm is to find a hyperplane in an N-dimensional space that distinctly classifies the data points.



# 4) Building the Model (SVM)

In [25]:
from sklearn.svm import SVC

In [26]:
svm = Pipeline([("tfidf", TfidfVectorizer()) , ("classifier", SVC(C = 100, gamma='auto'))])

In [27]:
svm.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('classifier', SVC(C=100, gamma='auto'))])

# 5) Predicting the results (SVM)

In [28]:
y_pred = svm.predict(X_test)

In [29]:
accuracy_score(y_test, y_pred)

1.0

In [30]:
confusion_matrix(y_test, y_pred)

array([[17,  0,  0,  0],
       [ 0, 24,  0,  0],
       [ 0,  0, 23,  0],
       [ 0,  0,  0, 16]])

In [31]:
print(classification_report(y_test, y_pred))

                    precision    recall  f1-score   support

 Backend Developer       1.00      1.00      1.00        17
    Data Scientist       1.00      1.00      1.00        24
          Embedded       1.00      1.00      1.00        23
Frontend Developer       1.00      1.00      1.00        16

          accuracy                           1.00        80
         macro avg       1.00      1.00      1.00        80
      weighted avg       1.00      1.00      1.00        80



In [32]:
#Single Data Prediction

In [33]:
test1 = ['Python Django	SQLite	Mysql	mongoDB']
test2 = ['HTML	Java Script	CSS	Mysql	mongoDB']
test3 = ['Python	Django	Flask	SQLite	Mysql']

In [34]:
print(classifier.predict(test1))



['Backend Developer']


In [35]:
print(classifier.predict(test2))

['Frontend Developer']


In [36]:
print(classifier.predict(test3))

['Backend Developer']


In [37]:
print(svm.predict(test1))
print(svm.predict(test2))
print(svm.predict(test3))

['Backend Developer']
['Frontend Developer']
['Backend Developer']
