# Model Building

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc # garbage collector
from scipy.stats import norm

# Visualization
import seaborn as sns
color = sns.color_palette()  # seaborn's current palette; not referenced by the cells visible here
sns.set(style="darkgrid")

import matplotlib.pyplot as plt
# NOTE(review): this matplotlib style is applied after sns.set above, so where
# the two overlap the later call presumably wins — confirm the intended look.
plt.style.use('fivethirtyeight')
plt.rcParams['font.size'] = 18  # default font size for all figures
plt.rcParams['patch.edgecolor'] = 'k'  # black edge on bars and other patches

%matplotlib inline

import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
py.init_notebook_mode(connected=True)

import warnings
# Only RuntimeWarning is silenced here; other categories are still shown.
warnings.filterwarnings('ignore', category = RuntimeWarning)

# Always good to set a seed for reproducibility.
# This seeds NumPy's global RNG; the estimators further down are given their
# own random_state values rather than reading SEED.
SEED = 7
np.random.seed(SEED)

# Turn off pandas' chained-assignment warning and show up to 50 columns
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 50

In [None]:
# List the files available in the input directory
from subprocess import check_output

input_listing = check_output(["ls", "../input/input/"])
print(input_listing.decode("utf8"))

In [None]:
# Load the training and test sets and report their shapes
print("Loading data...")
train = pd.read_csv('../input/input/land_train.csv')
print("Train rows and columns", train.shape)
test = pd.read_csv('../input/input/land_test.csv')
# This line reports the test set, so it is labelled "Test" (it used to say "Train").
print("Test rows and columns", test.shape)

## 1.  Remove Outliers 

In [None]:
def TurkyOutliers(df_out,nameOfFeature,drop=False):
    """Report, and optionally remove, the Tukey (1.5 * IQR) outliers of one column.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, where Q1/Q3 are the 25th/75th percentiles of the column.
    Quartiles are computed ignoring NaN, and NaN values themselves are not
    counted as outliers.

    Parameters
    ----------
    df_out : pandas.DataFrame
        Frame to inspect. It is never modified.
    nameOfFeature : str
        Label of the numeric column to test.
    drop : bool, default False
        When True, return a copy without the outlier rows.

    Returns
    -------
    pandas.DataFrame
        With ``drop=True``: a new frame without the outlier rows and with the
        index reset to 0..n-1. Otherwise ``df_out`` itself, unchanged.
    """
    valueOfFeature = df_out[nameOfFeature]

    # nanpercentile keeps a single missing value from turning both quartiles
    # into NaN (which would mark every row as an outlier).
    Q1 = np.nanpercentile(valueOfFeature, 25.)
    Q3 = np.nanpercentile(valueOfFeature, 75.)

    # Outlier step: 1.5 times the interquartile range
    step = (Q3 - Q1) * 1.5

    # Computed once and reused for both the report and the row selection.
    is_outlier = (valueOfFeature < Q1 - step) | (valueOfFeature > Q3 + step)
    feature_outliers = valueOfFeature[is_outlier].values

    print ("Number of outliers (inc duplicates): {} and outliers: {}".format(len(feature_outliers), feature_outliers))
    if drop:
        # Select rows by boolean position rather than by index label, so the
        # result is correct for any index (non-default, shuffled, duplicated).
        good_data = df_out[~is_outlier.to_numpy()].reset_index(drop = True)
        print ("New dataset with removed outliers has {} samples with {} features each.".format(*good_data.shape))
        return good_data
    else:
        print ("Nothing happens, df.shape = ",df_out.shape)
        return df_out

In [None]:
# Remove outliers from each column in turn.
# Every pass starts from the result of the previous one, so the final frame has
# been filtered on all six columns (restarting from `train` each time would
# keep only the last column's filter). Quartiles for later columns are
# therefore computed on the already-filtered rows. `train` is left untouched.
df_clean = train
for feature in ['X1', 'X2', 'X3', 'X4', 'X5', 'X6']:
    df_clean = TurkyOutliers(df_clean, feature, True)

In [None]:
# Remove outliers from each column of the test set in turn.
# Every pass starts from the result of the previous one, so all six filters
# apply (restarting from `test` each time would keep only the last one).
# NOTE(review): df_clean_test is not referenced by the later cells visible
# here, which predict on `test`; dropping test rows would also leave those rows
# without a prediction — confirm whether this frame is needed at all.
df_clean_test = test
for feature in ['X1', 'X2', 'X3', 'X4', 'X5', 'X6']:
    df_clean_test = TurkyOutliers(df_clean_test, feature, True)

Note: Removing outliers is not always a good idea; how outliers should be treated depends on the problem. If we keep the outliers, we are limited to algorithms that are robust to them.

## 2. Remove Correlated Features

In [None]:
# Create correlation matrix
corr_matrix = df_clean.corr()

# Keep only the strict upper triangle so each pair of columns is examined once.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Columns whose absolute correlation with an earlier column is greater than 0.95
to_drop = [column for column in upper.columns if any(abs(upper[column]) > 0.95)]
to_drop

In [None]:
# Save the labels before the 'target' column is dropped from `train`.
# They are read from the full `train` frame (not `df_clean`), i.e. the same
# rows that are scaled and fitted in the later cells.
y = train['target']

In [None]:
# Drop the highly correlated features from both sets; the training set also
# loses its 'target' column.
# The training column list is built with `+` instead of to_drop.append(...),
# so `to_drop` itself is not mutated and keeps meaning "correlated features".
test = test.drop(columns = to_drop)
train = train.drop(columns = to_drop + ['target'])

## 3. Scaling Features

In [None]:
from sklearn.preprocessing import StandardScaler
# Create the scaler object (a StandardScaler, not a min-max scaler)
scaler = StandardScaler()

# Fit the scaler on the training features and transform them in one call
train_df = scaler.fit_transform(train)

In [None]:
# Scale the test features with the scaler that was fitted on the training set
test_df = scaler.transform(test)

In [None]:
# Wrap the scaled values back into DataFrames.
# Column labels come from the frames that were scaled rather than a hand-typed
# list, so they stay correct if the set of dropped features ever changes.
train_df = pd.DataFrame(train_df, columns=train.columns)
test_df = pd.DataFrame(test_df, columns=test.columns)

## 4. Model Building

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score

# Model imports
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegressionCV, RidgeClassifierCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier

# Custom scorer for cross validation: f1_score called with average='micro',
# declared as a metric where greater is better
scorer = make_scorer(f1_score, greater_is_better=True, average = 'micro')

In [None]:
# Feature names, in the column order of the scaled training frame
features = train_df.columns.tolist()

### 4.1 Random Forest Model

In [None]:
%%time
model = RandomForestClassifier(n_estimators=100, random_state=10, 
                               n_jobs = -1)
# 10 fold cross validation
cv_score = cross_val_score(model, train_df, y, cv = 5, scoring = scorer)

print(f'10 Fold Cross Validation F1 Score = {round(cv_score.mean(), 4)} with std = {round(cv_score.std(), 4)}')

In [None]:
# Fit the forest on the full scaled training set
model.fit(train_df, y)

# Pair every feature name with the importance reported by the fitted model
importance_columns = {'feature': features, 'importance': model.feature_importances_}
feature_importances = pd.DataFrame(importance_columns)
feature_importances

In [None]:
# Plot feature importance
feature_importance = model.feature_importances_
# make importances relative to max importance (largest becomes 100)
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5

# A dedicated figure/axes replaces plt.subplot(1, 2, 2), which placed the chart
# in the right half of a two-panel grid and left the other half empty.
fig, ax = plt.subplots()
ax.barh(pos, feature_importance[sorted_idx], align='center')
ax.set_yticks(pos)
ax.set_yticklabels(train_df.columns[sorted_idx])
ax.set_xlabel('Relative Importance')
ax.set_title('Variable Importance')
plt.show()

In [None]:
import warnings 
from sklearn.exceptions import ConvergenceWarning

# Silence the warning categories that the models emit while fitting
for ignored_category in (ConvergenceWarning, DeprecationWarning, UserWarning):
    warnings.filterwarnings('ignore', category = ignored_category)

# Empty frame that collects one row of cross-validation results per model
result_columns = ['model', 'cv_mean', 'cv_std']
model_results = pd.DataFrame(columns = result_columns)

def cv_model(train, train_labels, model, name, model_results=None):
    """Perform 10 fold cross validation of a model.

    The model is scored with the notebook-level `scorer`, and the mean and
    standard deviation of the fold scores are printed.

    Parameters
    ----------
    train : features passed to cross_val_score.
    train_labels : labels passed to cross_val_score.
    model : estimator to evaluate.
    name : label stored in the 'model' column of the results.
    model_results : pandas.DataFrame or None
        Accumulated results. When given, a row with 'model', 'cv_mean' and
        'cv_std' is added to a new frame; the argument itself is not modified.

    Returns
    -------
    pandas.DataFrame or None
        The extended results frame, or None when `model_results` is None.
    """
    cv_scores = cross_val_score(model, train, train_labels, cv = 10, scoring=scorer, n_jobs = -1)
    print(f'10 Fold CV Score: {round(cv_scores.mean(), 5)} with std: {round(cv_scores.std(), 5)}')

    if model_results is None:
        return None

    new_row = pd.DataFrame({'model': name,
                            'cv_mean': cv_scores.mean(),
                            'cv_std': cv_scores.std()},
                           index = [0])

    if len(model_results) == 0:
        # Nothing to concatenate yet: return the row laid out with the
        # accumulator's columns first. Skipping the empty frame keeps the score
        # columns numeric, which the results plot relies on.
        extra_columns = [c for c in new_row.columns if c not in model_results.columns]
        return new_row.reindex(columns = list(model_results.columns) + extra_columns)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([model_results, new_row], ignore_index = True)

### 4.2 Linear Support Vector Classifier

In [None]:
# Cross-validate a linear SVM with default settings and record its scores
linear_svc = LinearSVC()
model_results = cv_model(train_df, y, linear_svc, 'LSVC', model_results)

### 4.3 Gaussian Naive Bayes

In [None]:
# Cross-validate Gaussian naive Bayes and record its scores
gaussian_nb = GaussianNB()
model_results = cv_model(train_df, y, gaussian_nb, 'GNB', model_results)

### 4.4 Linear Discriminant Analysis

In [None]:
# Cross-validate linear discriminant analysis and record its scores
lda = LinearDiscriminantAnalysis()
model_results = cv_model(train_df, y, lda, 'LDA', model_results)

### 4.5 K Nearest Neighbour

In [None]:
# Cross-validate k-nearest neighbours for several neighbourhood sizes
for n_neighbors in (5, 10, 20):
    print(f'\nKNN with {n_neighbors} neighbors\n')
    knn = KNeighborsClassifier(n_neighbors = n_neighbors)
    model_results = cv_model(train_df, y, knn, f'knn-{n_neighbors}', model_results)

### 4.6 Extra Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Cross-validate a 100-tree extra-trees ensemble and record its scores
extra_trees = ExtraTreesClassifier(n_estimators = 100, random_state = 10)
model_results = cv_model(train_df, y, extra_trees, 'EXT', model_results)

### 4.7 Random Forest Classifier

In [None]:
# Cross-validate a 150-tree random forest and record its scores
random_forest = RandomForestClassifier(n_estimators = 150, random_state = 10)
model_results = cv_model(train_df, y, random_forest, 'RF', model_results)

In [None]:
# Plot the mean CV F1 score per model, with the fold standard deviation as the
# error bar. A re-indexed copy is plotted instead of toggling the index of
# model_results with inplace=True, so model_results is never left indexed by
# 'model' if plotting raises and the cell can be re-run safely.
results_by_model = model_results.set_index('model')
results_by_model['cv_mean'].plot.bar(color = 'orange', figsize = (8, 6),
                                     yerr = list(results_by_model['cv_std']),
                                     edgecolor = 'k', linewidth = 2)
plt.title('Model F1 Score Results');
plt.ylabel('Mean F1 Score (with error bar)');

In [None]:
# Display the accumulated cross-validation results
model_results

In [None]:
# Final model: a 100-tree random forest.
# It is fitted on the whole training set, which is generally a good idea.
clf = RandomForestClassifier(n_estimators = 100, random_state = 10)
clf.fit(train_df, y)

In [None]:
# Predict labels for the scaled test data with the final model
predict = clf.predict(test_df) 

In [None]:
# Attach the predictions to the scaled test frame.
# NOTE(review): this adds a column to test_df in place, so re-running the
# prediction cell afterwards would pass an extra 'target' column to the model.
test_df['target'] = predict

In [None]:
# Preview the first rows of the test frame, including the predicted 'target'
test_df.head()

In [None]:
# Create the output file
# Naming the output file as model number - model used - estimators
# NOTE(review): the frame written here holds the scaled features plus 'target',
# and to_csv writes the row index as well unless index=False is passed —
# confirm this is the layout expected for the output file.

test_df.to_csv("01-rf_100.csv")

In [None]:
# ! pip freeze > requirements.txt # To generate requirements file for reproducibility.


**Conclusion:**

* We achieved a micro F1-score of 0.956 using the random forest model. For more details, see the brief documentation in the socialcops/docs folder.
* Tree based models are more suitable for this problem.