## 1. General use

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Pre processing

In [None]:
# Train / test split: hold out 30% of the rows for testing
from sklearn.model_selection import train_test_split
X = df.drop(columns=['target'])  # input features: every column except the target
y = df['target']  # target column
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

# Standardize variables
# To prevent data leakage from the test set, the scaler is fitted on the
# training split only; the same fitted transformation is then applied to
# both the training and the test split.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)  # statistics are learned from the training rows only
X_train_scaled = pd.DataFrame(scaler.transform(X_train),
                              columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test),
                             columns=X_test.columns, index=X_test.index)

# Alternative: min-max scaling instead of standardization.
# Use one scaler or the other; running both leaves `scaler` and the
# *_scaled frames holding the MinMaxScaler version.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)  # min/max are learned from the training rows only
X_train_scaled = pd.DataFrame(scaler.transform(X_train),
                              columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test),
                             columns=X_test.columns, index=X_test.index)


### Encode categorical variables as dummy (one-hot) columns
cat_feats = ['cat_feat1','cat_feat2']  # list with the categorical feature names
# Option 1: in one step
final_data = pd.get_dummies(df, columns=cat_feats, drop_first=True)
# Option 2: build the dummies first (so they can be explored), then swap them
# in for the original categorical columns.
dummies = pd.get_dummies(df[cat_feats], drop_first=True)
# Pass the list itself, not the string 'cat_feats', so the listed columns are dropped.
df = pd.concat([df.drop(cat_feats, axis=1), dummies], axis=1)

# Classification metrics: confusion matrix first, then the classification report
from sklearn.metrics import classification_report, confusion_matrix
print('Confusion Matrix', confusion_matrix(y_test, predictions), sep='\n')
print('-----------------------------------------------------',
      classification_report(y_test, predictions), sep='\n')

# Merge dataframes (join on the shared 'item_id' column)
df = df.merge(movie_titles, on='item_id')

# Create new variables (feature engineering)
# alternative 1: compute the new value element by element with apply
df['equal_or_lower_than_4?'] = df['set_of_numbers'].apply(
    lambda value: 'True' if value <= 4 else 'False'
)
# alternative 2: translate existing values through a lookup dictionary
loan_status_codes = {'Fully Paid': 1, 'Charged Off': 0}
df['loan_repaid'] = df['loan_status'].map(loan_status_codes)


# Percentage of missing (NA) values per column, largest first
df.isna().mean().mul(100).sort_values(ascending=False)

# Mean of one variable within each group of another (correlated) variable
df.groupby('total_acc')['mort_acc'].agg('mean')

# Apply a function row by row (axis=1) to build column 'col' from two or more columns
df['col'] = df.apply(func, axis=1)

# Pass extra variables in an apply statement
# Option 1: wrap the call in a lambda that closes over the extra variable
df[cols].apply(lambda x: apply_join(x, sep), axis=1)
# Option 2: forward extra positional arguments through `args` (a tuple).
# The applied function must then accept them after the row, i.e. it has to be
# declared as:  def func(x, sep): ...
df[cols].apply(func, args=(sep,), axis=1)

# Summarize only the columns of a given dtype
df.select_dtypes(include=['object']).info()

# Replace several specific values with a single value
df['col'] = df['col'].replace({'NONE': 'OTHER', 'ANY': 'OTHER'})

## 2. Supervised Algorithms

### Linear Regression

In [None]:
from sklearn import metrics
from sklearn.linear_model import LinearRegression

# Fit on the training split, then predict the test split
lm = LinearRegression()
lm.fit(X_train, y_train)
predict = lm.predict(X_test)

# Regression errors on the test split; RMSE is the square root of the MSE
mae = metrics.mean_absolute_error(y_test, predict)
mse = metrics.mean_squared_error(y_test, predict)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))


### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit on the training split (fit returns the estimator), then predict the test split
logmodel = LogisticRegression().fit(X_train, y_train)
predictions = logmodel.predict(X_test)




### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# n_neighbors sets the number of neighbours used by the classifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

# Choose the right n_neighbors: record the share of misclassified test rows
# for every K from 1 to 49
k_values = range(1, 50)
error = []
for i in k_values:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    predictions = knn.predict(X_test)
    error.append(np.mean(predictions != y_test))

# Elbow plot: error rate against K
plt.figure(figsize=(12, 6))
plt.plot(k_values, error, color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

### Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fit a tree with default settings on the training split and predict the test split
dtree = DecisionTreeClassifier().fit(X_train, y_train)
predictions = dtree.predict(X_test)


### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Forest of 100 trees: fit on the training split and predict the test split
rfc = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)


### Support Vector Classifier

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Baseline: SVC with default hyper-parameters
model = SVC().fit(X_train, y_train)
predictions = model.predict(X_test)


# Grid search (to optimize the SVC's C and gamma parameters)
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
grid.fit(X_train, y_train)  # May take a while!
grid.best_params_  # the best parameter combination found by the grid
grid.best_estimator_  # the best estimator found by the grid
grid_predictions = grid.predict(X_test)  # predict again, now with the tuned model


## 3. Unsupervised Algorithms

### K Means

In [None]:
from sklearn.cluster import KMeans
# Cluster the rows of df into two groups (fit returns the fitted estimator)
kmeans = KMeans(n_clusters=2).fit(df)
kmeans.cluster_centers_  # coordinates of the cluster centers
kmeans.labels_  # cluster label assigned to each row


### Principal Component Analysis (for feature extraction)

In [None]:
# The data must be scaled before applying PCA
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df)
scaled_data = scaler.transform(df)

# Then PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=2)  # number of components to keep
pca.fit(scaled_data)  # compute the components
x_pca = pca.transform(scaled_data)  # apply the transformation
x_pca.shape  # shows the new dimensionality


# An interesting way to see how each original feature contributes to each
# principal component is a heatmap of original features vs. principal components.
pca.components_
# One row per component, one column per original feature. The scaler and the
# PCA were fitted on df, so the feature names are df's column labels.
df_comp = pd.DataFrame(pca.components_, columns=df.columns)
plt.figure(figsize=(12,6))
sns.heatmap(df_comp, cmap='plasma')


### Recommender Systems

#### Simple Recommender by giving similar movies (content based)

In [3]:
# Per-title summary used below: the mean rating and the number of ratings of
# every title. The rest of this cell reads ratings['num of ratings'], indexed
# by title.
ratings = df.groupby('title')['rating'].agg(['mean', 'count'])
ratings.columns = ['rating', 'num of ratings']

# Now let's create a matrix that has the user ids on one axis and the movie
# title on another axis. Each cell will then consist of the rating the user
# gave to that movie. Note there will be a lot of NaN values, because most
# people have not seen most of the movies.
moviemat = df.pivot_table(index='user_id', columns='title', values='rating')
ratings.sort_values('num of ratings', ascending=False).head(10)

# Let's choose two movies: Star Wars, a sci-fi movie, and Liar Liar, a comedy.
# Now let's grab the user ratings for those two movies:
starwars_user_ratings = moviemat['Star Wars (1977)']
liarliar_user_ratings = moviemat['Liar Liar (1997)']

# We can then use the corrwith() method to correlate every movie's ratings
# column with the ratings of the chosen movie:
similar_to_starwars = moviemat.corrwith(starwars_user_ratings)
similar_to_liarliar = moviemat.corrwith(liarliar_user_ratings)

# Let's clean this by removing NaN values and using a DataFrame instead of a series:
corr_starwars = pd.DataFrame(similar_to_starwars, columns=['Correlation']).dropna()

# Now if we sort the dataframe by correlation, we should get the most similar
# movies, however note that we get some results that don't really make sense.
# This is because there are a lot of movies only watched once by users who also
# watched Star Wars (it was the most popular movie).
corr_starwars.sort_values('Correlation', ascending=False).head(10)

# Let's fix this by keeping only the movies with more than 100 ratings.
# Now sort the values and notice how the titles make a lot more sense:
corr_starwars = corr_starwars.join(ratings['num of ratings'])
corr_starwars[corr_starwars['num of ratings'] > 100].sort_values('Correlation', ascending=False).head()



In [None]:
# Another, more advanced recommender using Memory-Based Collaborative Filtering
# is in the notebook 02-Advanced Recommender Systems with Python.ipynb

### NLP
More interesting resources in 01-NLP (Natural Language Processing) with Python.ipynb

In [None]:
import nltk
import string


# define a function to pre-process de text
def text_process(mess, stop_words=None):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text

    Parameters
    ----------
    mess : str
        The raw text to clean.
    stop_words : iterable of str, optional
        Lower-case words to drop. When omitted, NLTK's English stopword list
        is used (this needs the NLTK 'stopwords' corpus to be available).

    Returns
    -------
    list of str
        The remaining words, in their original order and original casing.
    """
    if stop_words is None:
        # Imported here so the function works even when the notebook never
        # imported `stopwords` at the top level.
        from nltk.corpus import stopwords
        stop_words = stopwords.words('english')
    # Build the lookup once per call: membership tests on a set are O(1),
    # and the list is no longer reloaded for every single word.
    stop_words = set(stop_words)

    # Drop every punctuation character and join the rest back into a string
    nopunc = ''.join(char for char in mess if char not in string.punctuation)

    # Keep the words whose lower-cased form is not a stopword
    return [word for word in nopunc.split() if word.lower() not in stop_words]


from sklearn.feature_extraction.text import CountVectorizer
# Might take a while...
# Bag of words: learn the vocabulary, then count the tokens of every message
bow_transformer = CountVectorizer(analyzer=text_process)
bow_transformer.fit(messages['message'])
messages_bow = bow_transformer.transform(messages['message'])
print('Shape of Sparse Matrix: ', messages_bow.shape)
print('Amount of Non-Zero occurences: ', messages_bow.nnz)

# After the counting, the term weighting and normalization can be done with TF-IDF,
# using scikit-learn's TfidfTransformer.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(messages_bow)

# Transform the entire bag-of-words corpus into a TF-IDF corpus at once:
messages_tfidf = tfidf_transformer.transform(messages_bow)

# Alternatively, the fit_transform() method can be used directly.

# We'll be using scikit-learn here, choosing the Naive Bayes classifier to start with:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB()
spam_detect_model.fit(messages_tfidf, messages['label'])

# Note: these predictions are made on the same messages the model was fitted on
all_predictions = spam_detect_model.predict(messages_tfidf)

from sklearn.metrics import classification_report
print(classification_report(messages['label'], all_predictions))

# To repeat the process it is better to create a pipeline and then pass the
# data through the pipeline like through any sklearn estimator
from sklearn.pipeline import Pipeline

pipeline_steps = [
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
]
pipeline = Pipeline(pipeline_steps)

pipeline.fit(msg_train, label_train)
predictions = pipeline.predict(msg_test)



### TensorFlow + Keras (features must be standardized)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout

# It is important to convert pandas objects to NumPy arrays for Keras
X = df[['feature1', 'feature2']].values  # features
y = df['price'].values  # label


# Basic model, declared as a list of layers: three ReLU layers of 4 units
# each, followed by a single output node for the prediction
model = Sequential([
    Dense(4, activation='relu'),
    Dense(4, activation='relu'),
    Dense(4, activation='relu'),
    Dense(1),
])

model.compile(optimizer='rmsprop', loss='mse')

#### Choosing an optimizer and loss

Keep in mind what kind of problem you are trying to solve:

    # For a multi-class classification problem
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # For a binary classification problem
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # For a mean squared error regression problem
    model.compile(optimizer='rmsprop',
                  loss='mse')
                  
#### Training

Below are some common definitions that are necessary to know and understand to correctly utilize Keras:

* Sample: one element of a dataset.
    * Example: one image is a sample in a convolutional network
    * Example: one audio file is a sample for a speech recognition model
* Batch: a set of N samples. The samples in a batch are processed independently, in parallel. If training, a batch results in only one update to the model. A batch generally approximates the distribution of the input data better than a single input. The larger the batch, the better the approximation; however, it is also true that the batch will take longer to process and will still result in only one update. For inference (evaluate/predict), it is recommended to pick a batch size that is as large as you can afford without going out of memory (since larger batches will usually result in faster evaluation/prediction).
* Epoch: an arbitrary cutoff, generally defined as "one pass over the entire dataset", used to separate training into distinct phases, which is useful for logging and periodic evaluation.
* When using validation_data or validation_split with the fit method of Keras models, evaluation will be run at the end of every epoch.
* Within Keras, there is the ability to add callbacks specifically designed to be run at the end of an epoch. Examples of these are learning rate changes and model checkpointing (saving).

In [None]:
# Train the model
model.fit(x=X_train,
          y=y_train,
          epochs=25,
          batch_size=256,
          validation_data=(X_test, y_test),
          )

# Evaluate
model.history.history['loss']  # the training loss recorded for every epoch

# Calculate the loss for every set
training_score = model.evaluate(X_train, y_train, verbose=0)
test_score = model.evaluate(X_test, y_test, verbose=0)
# Or better, get the values from the history
losses = pd.DataFrame(model.history.history)

# Predict (the inputs have to be standardized)
test_predictions = model.predict(X_test)
### For a classification problem with a sigmoid output, threshold the
### predicted probabilities (Sequential.predict_classes was removed in
### TensorFlow 2.6):
#predictions = (model.predict(X_test) > 0.5).astype('int32')

# To calculate the error
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Flatten with reshape(-1) so this works for any number of test samples,
# and build the comparison frame here instead of relying on a pre-existing one.
test_predictions = pd.Series(test_predictions.reshape(-1))
pred_df = pd.DataFrame({
    'Test Y': np.ravel(y_test),
    'Model Predictions': test_predictions.values,
})

print('MAE:', mean_absolute_error(pred_df['Test Y'], pred_df['Model Predictions']))
print('MSE:', mean_squared_error(pred_df['Test Y'], pred_df['Model Predictions']))

# And plot the error
pred_df['Error'] = pred_df['Test Y'] - pred_df['Model Predictions']
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot/displot; kept as-is so older seaborn installations still work.
sns.distplot(pred_df['Error'], bins=50)

In [None]:
# When predicting new data it is important to standardize it with the same
# scaler that was used for training.
# A single sample has to be reshaped into one row (1, n_features) first;
# reshape(1, -1) infers the number of features instead of hard-coding it.
# Assumes new_gem is a pandas object holding one sample (it exposes .values).
new_gem_scaled = scaler.transform(new_gem.values.reshape(1, -1))

# Regression (or raw network output)
model.predict(new_gem_scaled)

# Binary classification with a sigmoid output: threshold the probability
# (Sequential.predict_classes was removed in TensorFlow 2.6)
(model.predict(new_gem_scaled) > 0.5).astype('int32')

#### Saving and Loading a Model

In [None]:
from tensorflow.keras.models import load_model
# Write the model to an HDF5 file, then read it back from the same path
model_path = 'my_model.h5'
model.save(model_path)
later_model = load_model(model_path)


#### Early Stopping
Stop training when a monitored quantity has stopped improving.
 
* Arguments:
        monitor: Quantity to be monitored.
        min_delta: Minimum change in the monitored quantity
            to qualify as an improvement, i.e. an absolute
            change of less than min_delta, will count as no
            improvement.
        patience: Number of epochs with no improvement
            after which training will be stopped.
        verbose: verbosity mode.
        mode: One of `{"auto", "min", "max"}`. In `min` mode,
            training will stop when the quantity
            monitored has stopped decreasing; in `max`
            mode it will stop when the quantity
            monitored has stopped increasing; in `auto`
            mode, the direction is automatically inferred
            from the name of the monitored quantity.

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
# Stop once the validation loss has not decreased for 25 epochs
early_stop = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=25,
    verbose=1,
)
# Up to 600 epochs; the callback can end training earlier
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=600,
    callbacks=[early_stop],
    verbose=1,
)

#### Early Stopping and Dropout

In [None]:
# Two hidden ReLU layers, each followed by a Dropout layer that randomly drops
# 50% of the units, and a single sigmoid output unit
model = Sequential([
    Dense(units=30, activation='relu'),
    Dropout(0.5),
    Dense(units=15, activation='relu'),
    Dropout(0.5),
    Dense(units=1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy')

## 4. Some visualization aids

In [None]:
# Scatter between 2 variables, with hue obtained from get_dummies ######
gg = final_data.plot(kind='scatter', y='F.Undergrad', x='Outstate', c=final_data['Private_Yes'],
                     colormap='coolwarm', colorbar=False, figsize=(10,10), grid=True)
gg.set(xlabel="Outstate", ylabel="F.Undergrad")

# ---- other alternative
# x and y are passed by keyword and the figure size through `height`
# (the same keyword the FacetGrid call below uses; `size` is the old name).
sns.lmplot(x='Room.Board', y='Grad.Rate', data=df, hue='Private',
           palette='coolwarm', height=6, aspect=1, fit_reg=False)
########################################################################

# Histogram of 'Outstate' with hue of 'Private' using FacetGrid and controlling the alpha
g = sns.FacetGrid(df, hue="Private", palette='viridis', height=6, aspect=2)
g = g.map(plt.hist, 'Outstate', bins=20, alpha=0.5)
g.add_legend()

# Plot correlation of the target vs all other features
# Start a new figure first so the bars are not drawn onto the axes of the
# previous plot when everything runs in a single cell.
plt.figure()
df.corr()['target'].sort_values().plot(kind='bar')

