In [1]:
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, f1_score, auc as sk_auc, roc_curve, precision_score, recall_score
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split, StratifiedKFold
import pickle
import random
import re
import keras

Using TensorFlow backend.


In [2]:
df = pd.read_csv("data/course_descriptions.csv", usecols=[1,2,3,4,5,6])

In [3]:
df.sample(10)

Unnamed: 0,description,length,course,ouid,fac,inst
603,The course is built up around 3 design project...,788,TPD4127,840,AD,AD-ID
3899,Chemical engineering thermodynamics is a basis...,1631,TKP4107,869,NV,NV-IKP
300,See: http://nordkurs.org/There will be present...,220,NORD2261,1080,HF,HF-ISL
728,1. Knowledge. The student has knowledge of fun...,601,MA3002,828,IE,IE-IMF
2534,Nursing - field and function II:\r\r\nNursing ...,2936,SPL2012,1102,MH,MH-IHG
2465,"After completing the course, the student \r\r\...",3813,FH3102,1102,MH,MH-IHG
1531,This course will provide an overview of the mo...,1627,TGB4270,1216,IV,IV-IGP
2790,"Statically determinate structures: Beams, plan...",1419,TKT4126,843,IV,IV-KT
1227,The aim is to establish methods for probabilis...,2178,MR8502,838,IV,IV-IMT
3400,"The course is taught every second year, next t...",2280,TM8101,831,IE,IE-IIK


In [4]:
df = df.dropna() #loc[~df["description"].isnull()]

In [5]:
df.loc[df.course=="IT3708"].values

array([['The main focus of the course is to build intelligent systems based on two key natural concepts: evolution by natural selection and swarm intelligence.  Such intelligent systems have thousands of useful applications in fields as diverse as control theory, telecommunications, music and art.  This course discusses both methods in great detail along with providing a bit of the biological basis for each.Lecture slides, a textbook (possibly 2).  Textbooks are chosen  at the beginning of the semester.Students will get both theoretical and practical programming experience with two of the best known sub-symbolic AI methods: evolutionary algorithms and swarm intelligence algorithms. ',
        682, 'IT3708', 827, 'IE', 'IE-IDI']], dtype=object)

In [56]:
df.fac.value_counts()

IV    913
HF    634
IE    616
SU    504
NV    423
OK    384
MH    296
AD    213
Name: fac, dtype: int64

In [6]:
def remove_punctuation(document):
    return "".join([ (c if c not in string.punctuation+"\n\r\t" else " ") for c in document])

def tokenize(document):
    return [w.lower() for w in remove_punctuation(document).split(" ") if len(w)>0]

In [7]:
stoplist = [l.strip() for l in open("stopwords.txt", "r").readlines()]

In [8]:
stoplist

['a',
 'able',
 'about',
 'across',
 'after',
 'all',
 'almost',
 'also',
 'am',
 'among',
 'an',
 'and',
 'any',
 'are',
 'as',
 'at',
 'be',
 'because',
 'been',
 'but',
 'by',
 'can',
 'cannot',
 'could',
 'dear',
 'did',
 'do',
 'does',
 'either',
 'else',
 'ever',
 'every',
 'for',
 'from',
 'get',
 'got',
 'had',
 'has',
 'have',
 'he',
 'her',
 'hers',
 'him',
 'his',
 'how',
 'however',
 'i',
 'if',
 'in',
 'into',
 'is',
 'it',
 'its',
 'just',
 'least',
 'let',
 'like',
 'likely',
 'may',
 'me',
 'might',
 'most',
 'must',
 'my',
 'neither',
 'no',
 'nor',
 'not',
 'of',
 'off',
 'often',
 'on',
 'only',
 'or',
 'other',
 'our',
 'own',
 'rather',
 'said',
 'say',
 'says',
 'she',
 'should',
 'since',
 'so',
 'some',
 'than',
 'that',
 'the',
 'their',
 'them',
 'then',
 'there',
 'these',
 'they',
 'this',
 'tis',
 'to',
 'too',
 'twas',
 'us',
 'wants',
 'was',
 'we',
 'were',
 'what',
 'when',
 'where',
 'which',
 'while',
 'who',
 'whom',
 'why',
 'will',
 'with',
 'would

In [36]:
y = df["fac"].astype(str)
X_train, X_test, y_train, y_test = train_test_split(df["description"], y, stratify=y)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((2987,), (2987,), (996,), (996,))

In [57]:
y_test.value_counts()

IV    228
HF    159
IE    154
SU    126
NV    106
OK     96
MH     74
AD     53
Name: fac, dtype: int64

In [58]:
y_train.value_counts()

IV    685
HF    475
IE    462
SU    378
NV    317
OK    288
MH    222
AD    160
Name: fac, dtype: int64

In [37]:
vec = TfidfVectorizer(tokenizer=tokenize, stop_words=stoplist, ngram_range=(1,2))
trn_vec= vec.fit_transform(X_train.values)
test_vec = vec.transform(X_test.values)

In [38]:
X_train = pd.concat([X_train, pd.get_dummies(y_train)], axis=1)
X_test = pd.concat([X_test, pd.get_dummies(y_test)], axis=1)
X_train.shape, X_test.shape

((2987, 9), (996, 9))

In [39]:
X_train.sample(5)

Unnamed: 0,description,AD,HF,IE,IV,MH,NV,OK,SU
3667,PLU8022 is focused on qualitative analysis of ...,0,0,0,0,0,0,0,1
2020,The course treats modelling of water resources...,0,0,0,1,0,0,0,0
2436,Topic 1: The context of integrated care.\r\n\r...,0,0,0,0,1,0,0,0
297,"Economic design criteria, investment and socio...",0,0,0,1,0,0,0,0
1095,The course is a study of religions originating...,0,1,0,0,0,0,0,0


In [40]:
label_cols = df["fac"].astype(str).unique().tolist()
print(label_cols)
preds = np.zeros((len(X_test), len(label_cols)))
preds.shape

['IE', 'HF', 'IV', 'AD', 'SU', 'MH', 'NV', 'OK']


(996, 8)

In [41]:
trn_vec.shape,test_vec.shape,  X_test.shape

((2987, 224171), (996, 224171), (996, 9))

In [42]:
def prior(y_i, y):
    '''
    y_i is either 1 or 0.
    y is numpy array of labels
    '''
    p = trn_vec[y==y_i].sum(0) # Number of documents in trn_doc with given label
    return (p+1) / ((y==y_i).sum()+1) # Fraction of all documents with given label

In [43]:
def get_mdl(y):
    y = y.values #pandas Series to numpy array
    r = np.log(prior(1,y) / prior(0,y)) #Log likelihood ratio for both possibilities
    m = LogisticRegression(C=4, dual=True) # Logistic regression model
    x_nb = trn_vec.multiply(r) # Multiply the Tf-idf features with this ratio
    return m.fit(x_nb, y), r

In [45]:
TRAIN = True
SAVE = False

In [46]:
models = {}
rs = {}
for i, j in enumerate(label_cols):
    print('fit', j)
    m,r = get_mdl(X_train[j])
    if TRAIN:
        preds[:,i] = m.predict_proba(test_vec.multiply(r))[:,1]
    else:
        if SAVE:
            np.save(open("webapp/r_"+j+".npy", "wb"), r)
            pickle.dump(m, open("webapp/"+j+"_model.pkl", "wb"))
    models[j] = m
    rs[j] = r

fit IE
fit HF
fit IV
fit AD
fit SU
fit MH
fit NV
fit OK


In [86]:
df_preds = pd.DataFrame(preds, columns=label_cols)

for c in df_preds.columns:
    df_preds[c+"_predicted"] = (df_preds.max(axis=1)==df_preds[c]).astype(int)

oh_y_test = pd.get_dummies(y_test)
oh_y_test = oh_y_test.rename({c:c+"_actual" for c in oh_y_test.columns},axis=1)

df_preds = pd.concat([df_preds, oh_y_test.reset_index()], axis=1)

res_df = pd.DataFrame(confusion_matrix(df_preds.loc[:,[c+"_actual" for c in label_cols]].values.argmax(1), df_preds.loc[:,[c+"_predicted" for c in label_cols]].values.argmax(1)), index=label_cols, columns=label_cols)

Unnamed: 0,IE,HF,IV,AD,SU,MH,NV,OK
0,0.024581,0.101164,0.033143,0.008138,0.127713,0.036769,0.017182,0.020509
1,0.097901,0.011977,0.716195,0.005418,0.012404,0.007123,0.016306,0.010501
2,0.026444,0.804153,0.040122,0.00562,0.013313,0.006747,0.014179,0.011207
3,0.040366,0.013063,0.97718,0.008971,0.016292,0.007264,0.020646,0.011572
4,0.051619,0.020096,0.078256,0.006793,0.01592,0.13141,0.96062,0.011461


In [92]:
res_df

Unnamed: 0,IE,HF,IV,AD,SU,MH,NV,OK
IE,131,4,14,1,1,0,0,3
HF,2,150,3,0,4,0,0,0
IV,11,1,204,2,4,0,1,5
AD,6,0,12,34,1,0,0,0
SU,5,1,4,0,115,0,0,1
MH,5,0,3,0,3,59,4,0
NV,8,0,8,0,0,0,89,1
OK,2,2,10,0,1,0,0,81


## Next steps
- Define metrics
- Plot
- Script evolution
- Think about steps

In [93]:
import mlflow

In [94]:
import os
from mlflow import log_metric, log_param, log_artifact

# Log a parameter (key-value pair)
log_param("param1", 5)

# Log a metric; metrics can be updated throughout the run
log_metric("foo", 1)
log_metric("foo", 2)
log_metric("foo", 3)

# Log an artifact (output file)
with open("output.txt", "w") as f:
    f.write("Hello world!")
log_artifact("output.txt")


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



MlflowException: Could not find a registered tracking store for: C:\Users\thotho\Documents\Repos\meetup-mlflow\mlruns. Currently registered schemes are: ['', 'file', 'databricks', 'http', 'https', 'postgresql', 'mysql', 'sqlite', 'mssql']