In [19]:
#Reading the database
import pandas as pd
#pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)


from google.colab import drive
drive.mount("/google_drive")
WORKSPACE_PATH = "/google_drive/My Drive/MBD Capstone"

xls = pd.ExcelFile(WORKSPACE_PATH+ '/ie_input_data_no_character_sin_no.xlsx')
df = pd.read_excel(xls, 'Sheet1')

  


Drive already mounted at /google_drive; to attempt to forcibly remount, call drive.mount("/google_drive", force_remount=True).


### Baseline Model NB, bag of words

In [20]:
#Choosing the columns to work and rename the columns
df = df.set_index('unique_id')
df = df[['text_answers','show_up']]
df.columns = ['text','labels']

In [21]:
# splitting data in train and test
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(df, stratify=df['labels'])


# Resampling the dataset to balance it
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_trainR, y_trainR = ros.fit_resample(X_train[['text']], X_train['labels'])
X_testR, y_testR = ros.fit_resample(X_test[['text']], X_test['labels'])

In [22]:
#Create a formula that eliminates the punctuation and put words in lower case
import string

def pre_process_text(data,col='text'):

  data[col] = data[col].astype('str')
  data[col] = data[col].str.replace('\|\|',' ')
  data[col] = [s.translate(str.maketrans('', '', string.punctuation)) for s in data[col]]
  data[col] = data[col].str.lower()
  return data

In [23]:
#Create a formula that deletes everything that is not a word, deleting the stop words, and do stemming (root word)
import nltk
from nltk.stem import *
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
nltk.download("stopwords")

import re

def process_text(raw_text):

    letters_only = re.sub("[^a-zA-Z]", " ",raw_text) 
    words = letters_only.lower().split()
    
    stops = set(stopwords.words("spanish")) 
    stops.remove('no')
    not_stop_words = [w for w in words if not w in stops]
    
    stemmer = SnowballStemmer('spanish')
    stemmed = [stemmer.stem(word) for word in not_stop_words]
    
    return( " ".join( stemmed ))  


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
#Apply formulas to the train and test set
X_trainR = pre_process_text(X_trainR,'text')
X_trainR['text'] = X_trainR['text'].apply(lambda x: process_text(x))

X_testR = pre_process_text(X_testR,'text')
X_testR['text'] = X_testR['text'].apply(lambda x: process_text(x))

  import sys


In [25]:
# Crate a dataframe of bag of words with tf_idf weight
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
count_vect = CountVectorizer(analyzer = "word")
train_features = count_vect.fit_transform(X_trainR['text'])
test_features = count_vect.transform(X_testR['text'])

tfidf = TfidfTransformer(norm="l2")
train_text_tfidf_features = tfidf.fit_transform(train_features)
test_text_tfidf_features = tfidf.fit_transform(test_features)   

In [26]:
# apply Naive Baises to the Data
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

clf = MultinomialNB()

clf.fit(train_text_tfidf_features,y_trainR)

y_train_pred = clf.predict(train_text_tfidf_features)
y_test_pred = clf.predict(test_text_tfidf_features)

acc_train = accuracy_score(y_trainR, y_train_pred)
acc_test = accuracy_score(y_testR, y_test_pred)

print(f' train accuracy: {acc_train:.3}, test accuracy: {acc_test:.3}')

 train accuracy: 0.694, test accuracy: 0.383


### Baseline model Logit and Random Forest with numeric values

In [27]:
#Read the data
xls = pd.ExcelFile(WORKSPACE_PATH+ '/JLB/numeric_vals.xlsx')
df_num = pd.read_excel(xls, 'Sheet1')

In [28]:
# list(df.index)

In [29]:
df_num = df_num.merge(df.reset_index()[['unique_id']])

In [30]:
#columns to use that are the ones that are numeric
cols = list(df_num.columns[df_num.columns.str.contains('\|')])
df_num = df_num[cols+['show_up']]


In [31]:

# splitting data in train and test
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(df_num, stratify=df_num['show_up'])

# Resampling the dataset to balance it
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_trainR, y_trainR = ros.fit_resample(X_train[cols], X_train['show_up'])
X_testR, y_testR = ros.fit_resample(X_test[cols], X_test['show_up'])

In [32]:
#scaling data to use with logit
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled = scaler.fit_transform(X_trainR)
X_trainR = pd.DataFrame(scaled, columns= X_trainR.columns)
scaled_t = scaler.transform(X_testR)
X_testR = pd.DataFrame(scaled_t, columns= X_testR.columns)

In [33]:
#apply logit
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state =41)

clf.fit(X_trainR,y_trainR)

y_train_pred = clf.predict(X_trainR)
y_test_pred = clf.predict(X_testR)

acc_train = accuracy_score(y_trainR, y_train_pred)
acc_test = accuracy_score(y_testR, y_test_pred)

print(f' train accuracy: {acc_train:.3}, test accuracy: {acc_test:.3}')

 train accuracy: 0.562, test accuracy: 0.419


In [34]:
#apply random forest
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state =41)

clf.fit(X_trainR,y_trainR)

y_train_pred = clf.predict(X_trainR)
y_test_pred = clf.predict(X_testR)

acc_train = accuracy_score(y_trainR, y_train_pred)
acc_test = accuracy_score(y_testR, y_test_pred)

print(f' train accuracy: {acc_train:.3}, test accuracy: {acc_test:.3}')

 train accuracy: 0.999, test accuracy: 0.472
