### Import Libraries

In [1]:
import numpy as np
import os
import pandas as pd
import inflect
# next we can import some sklearn libraries to start working with stuff
## transformers and pipline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
## model selectors
from sklearn.model_selection import train_test_split, cross_val_score
## models
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
## feature extarctors
from sklearn.preprocessing import OneHotEncoder
from average_word_length_extractor import AverageWordLengthExtractor
from question_extractor import QuestionExtractor
from int_to_words_extractor import NumberStringExtractor
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

### Reading data

In [2]:
# read data
data = pd.read_excel('../data/sample_data/EaxampleData.xlsx')

In [3]:
# convert yes to True(1) and no to False(0)
data = data.replace(to_replace={'yes': 1, 'no': 0})

In [4]:
# features over which we have to predict
prediction_columns = ['stringedQuestion',
                   'leading', 'Symptom', 'PMH', 'MEDS', 'ALLG', 'FAMHx', 'lifestyle',
                   'pysch', 'SOCHx', 'sexualHistory', 'substanceUse', 'PE', 'FORM',
                   'supportProvision', 'transition']

In [5]:
# extract numerical columns
numerical_columns = data.select_dtypes(np.number).columns

In [6]:
def add_patient_physician_interactions(data, numerical_columns):
    assert type(data) == pandas.core.frame.DataFrame
    # to combine data from two columns
    comm_df = pd.DataFrame({'original_semantics': data['semanticType'],
                            'shifted_semantics' : data['semanticType'].shift(-1),
                            'original_speaker': data['speakerID'],
                            'shifted_speaker': data['speakerID'].shift(-1)}, index=data.index)

    
    data['comparisons'] = comm_df.apply(lambda x: 1 if (x.original_semantics in ['openQuestion', 'question']) and 
                  (x.shifted_semantics == 'statement') and
                  (x.original_speaker == 'doctor') and
                  (x.shifted_speaker == 'patient') else 0, axis = 1)

#     data.head()
    indexes = data.loc[data.comparisons == 1].index 
    data.loc[data.comparisons == 1,numerical_columns] = data.loc[data.comparisons == 1, numerical_columns] + data[numerical_columns].shift(-1).iloc[indexes,:]
#     [numerical_columns] = data[numerical_columns].shift(-1)
    return data

In [9]:
# extract doctor interactions
complete_data = data.loc[data.speakerID == 'doctor']

### Feature Engineering

In [10]:
# select columns by data
complete_data.drop(prediction_columns, inplace=True, axis=1)
unusable_columns = ['speakerID']
complete_data.drop(unusable_columns, inplace=True, axis=1)
# select x and y
X = complete_data
y = data.loc[complete_data.index, data.columns == prediction_columns[2]]
# split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify = y)

In [12]:
ngram_count = CountVectorizer(ngram_range=(1, 3), analyzer='char')
avg_word = AverageWordLengthExtractor()
question_ex = QuestionExtractor()
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
numberstring= NumberStringExtractor()

## Modelling

In [51]:
#convert numbers to text
num_to_str = ColumnTransformer([
    ('numberstring', numberstring, 3)
], remainder='passthrough')

# text encoders
encoders = ColumnTransformer([
    ('ngram', ngram_count, 0),
    ('avg_word', avg_word, 0),
    ('question', question_ex, 0)
], remainder = 'passthrough')

# one hot encoding
one_hot = ColumnTransformer([
    ('one_hot', one_hot_encoder, ['semanticType'])
], remainder = 'passthrough')

# text pipeline
text_features = Pipeline([
    ('num_to_str', num_to_str),
    ('encoders', encoders)
])

# preprocessing
preprocess = Pipeline([
    ('one_hot_encoder', one_hot),
    ('text_feat', text_features)
])

# ml pipeline
ml_pipeline = Pipeline([
    ('preprocessor', preprocess),
    ('model', RandomForestClassifier(n_estimators = 100))
])

In [52]:
cross_val_score(ml_pipeline, X_train, y_train, cv = 10).mean()

  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)


0.8483333333333334