In [19]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

In [20]:
### Data Pre-processing ###
# Load the CSV and discard every row that has a missing value in any column
# (the original author observed five rows with NA in the text column).
data = pd.read_csv('random_f_data.csv').dropna()

# Keep the free-text column aside before it is removed from the table.
text = data['expDescription']

# Remove the index/identifier columns and the raw text from `data`.
# NOTE(review): presumably what remains is one label column per skill ID --
# confirm against the CSV schema.
data = data.drop(columns=['Unnamed: 0', 'ownerID', 'school', 'expDescription'])

# Vectorize the text with TF-IDF.
# NOTE(review): the vectorizer is fitted on all rows here, before the
# train/test splits performed in later cells, so held-out rows contribute to
# the fitted vocabulary/IDF weights.
tv = TfidfVectorizer()
X = tv.fit_transform(text)

# Column names (as strings) of the skills to predict in later cells.
skills = [str(skill_id) for skill_id in (4, 8, 10, 11, 19)]

In [3]:
### Grid Search ###
def gridSearch(skillID, model, param):
    """Grid-search `model` for one skill column and print the outcome.

    Reads the notebook-level globals `data` (label table) and `X` (feature
    matrix), holds out 20% of the rows with a fixed seed, runs an F1-scored
    `GridSearchCV` on the remaining 80%, and prints the confusion matrix on
    the held-out rows together with the best parameter combination.

    Parameters
    ----------
    skillID : str or int
        Identifier of the target column; converted with `str()` before lookup.
    model : estimator
        Estimator instance handed to `GridSearchCV`.
    param : dict
        Parameter grid handed to `GridSearchCV`.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    skill_label = str(skillID)
    target = data[skill_label]

    # Fixed random_state keeps the hold-out split identical across calls.
    X_train, X_test, y_train, y_test = train_test_split(
        X, target, test_size=0.2, random_state=42
    )

    # Cross-validated search scored by F1, using all available cores.
    search = GridSearchCV(model, param, scoring='f1', n_jobs=-1)
    search.fit(X_train, y_train)

    # Evaluate the searched estimator on the held-out rows.
    cm = confusion_matrix(y_test, search.predict(X_test))
    print('For skill', skill_label, '\n', 'Confusion Matrix:\n', cm, '\n', 'best parameters:', search.best_params_)

In [4]:
# Random forest whose class 1 is weighted 0.9 against 0.1 for class 0.
rf = RandomForestClassifier(class_weight={0: 0.1, 1: 0.9})

# Grid to explore: min_samples_split in {2, 4, 6} and
# n_estimators in {50, 70, 90, 110, 130, 150}.
rf_param = {
    'min_samples_split': range(2, 7, 2),
    'n_estimators': range(50, 151, 20),
}

# Run the search once per skill column; each call prints its own report.
for skill in skills:
    gridSearch(skill, rf, rf_param)

For skill 4 
 Confusion Matrix:
 [[514 148]
 [194 132]] 
 best parameters: {'min_samples_split': 6, 'n_estimators': 150}
For skill 8 
 Confusion Matrix:
 [[528 151]
 [192 117]] 
 best parameters: {'min_samples_split': 6, 'n_estimators': 50}
For skill 10 
 Confusion Matrix:
 [[708  61]
 [196  23]] 
 best parameters: {'min_samples_split': 6, 'n_estimators': 50}
For skill 11 
 Confusion Matrix:
 [[377 222]
 [144 245]] 
 best parameters: {'min_samples_split': 6, 'n_estimators': 150}
For skill 19 
 Confusion Matrix:
 [[421 198]
 [200 169]] 
 best parameters: {'min_samples_split': 6, 'n_estimators': 150}


In [5]:
# Gradient boosting classifier with default settings as the base estimator.
gb = GradientBoostingClassifier()

# Grid to explore for gradient boosting: min_samples_split in {2, 4, 6} and
# n_estimators in {50, 70, 90, 110, 130, 150}.
gb_param = {
    'min_samples_split': range(2, 7, 2),
    'n_estimators': range(50, 151, 20),
}

# Search with this cell's own grid (`gb_param`) so the cell does not depend on
# a parameter grid defined for a different model in another cell.
for skill in skills:
    gridSearch(skill, gb, gb_param)

For skill 4 
 Confusion Matrix:
 [[632  30]
 [268  58]] 
 best parameters: {'min_samples_split': 2, 'n_estimators': 150}
For skill 8 
 Confusion Matrix:
 [[648  31]
 [270  39]] 
 best parameters: {'min_samples_split': 6, 'n_estimators': 150}
For skill 10 
 Confusion Matrix:
 [[755  14]
 [195  24]] 
 best parameters: {'min_samples_split': 6, 'n_estimators': 150}
For skill 11 
 Confusion Matrix:
 [[532  67]
 [248 141]] 
 best parameters: {'min_samples_split': 2, 'n_estimators': 150}
For skill 19 
 Confusion Matrix:
 [[577  42]
 [300  69]] 
 best parameters: {'min_samples_split': 2, 'n_estimators': 150}


In [12]:
### Use best parameters found on the grid search to build models ###
# NOTE(review): the grid-search printouts report different best parameters for
# different skills; a single setting (min_samples_split=6, n_estimators=150)
# is used for every skill here -- confirm this is intended.
svc = SVC(gamma='auto')
rf = RandomForestClassifier(
    n_estimators=150,
    min_samples_split=6,
    class_weight={0: 0.1, 1: 0.9},
)
gb = GradientBoostingClassifier(n_estimators=150, min_samples_split=6)

# Majority-vote ('hard') ensemble over the three base models.
voting = VotingClassifier(
    estimators=[('rf', rf), ('gb', gb), ('svc', svc)],
    voting='hard',
)

In [18]:
### Train and run the voting classifier, print the confusion matrix per skill ###
for skill in skills:
    # Target column for the current skill.
    y = data[skill]

    # 80/20 hold-out split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42, test_size=0.2
    )

    # Build a new hard-voting ensemble for this skill from the three models.
    voting = VotingClassifier(
        voting='hard',
        estimators=[('rf', rf), ('gb', gb), ('svc', svc)],
    )
    voting.fit(X_train, y_train)

    # Score the ensemble on the held-out rows.
    y_pred = voting.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print('For skill', skill, '\n', 'Confusion Matrix:\n', cm)

For skill 4 
 Confusion Matrix:
 [[650  12]
 [288  38]]
For skill 8 
 Confusion Matrix:
 [[666  13]
 [279  30]]
For skill 10 
 Confusion Matrix:
 [[767   2]
 [211   8]]
For skill 11 
 Confusion Matrix:
 [[547  52]
 [264 125]]
For skill 19 
 Confusion Matrix:
 [[597  22]
 [318  51]]
