In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pylab as pl # linear algebra + plots
from scipy.sparse import hstack, vstack
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Reading data and preprocessing it
In this kernel we will be dealing with 3 kinds of data: numerical, categorical and text

In [None]:
# Load the three competition tables from the Kaggle input directory.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
resources = pd.read_csv("../input/resources.csv")

# Convert every value of the "previously posted projects" count in the test split with int().
prev_projects_col = "teacher_number_of_previously_posted_projects"
test[prev_projects_col] = test[prev_projects_col].map(int)

In [None]:
# Running registries of feature column names, one list per kind of feature.
# Later cells append to these as features are engineered.
numerical_cols, categorical_cols, text_cols = [], [], []

In [None]:
# Tag every row with the split it came from so the stacked frame can be separated again later.
train["origin"] = "train"
test["origin"] = "test"
# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it the row labels
# of `train` and `test` repeat, and any label-aligned assignment on the combined frame
# becomes ambiguous.
train_test = pd.concat([train, test], ignore_index=True)
train_test.head(3)

In [None]:
# Preview the first rows of the resources table.
resources.head()

In [None]:
# Many rows have no text in project_essay_3 / project_essay_4. Where the extra essays are
# present, fold the four essays into two (1+2 -> project_essay_1, 3+4 -> project_essay_2)
# so every row has the same two-essay layout, then drop the extra columns.
proj_flag = train_test.project_essay_3.notnull() | train_test.project_essay_4.notnull()
extra_rows = proj_flag.values  # plain boolean array: positional, independent of index labels
essay_parts = train_test.loc[
    extra_rows,
    ['project_essay_1', 'project_essay_2', 'project_essay_3', 'project_essay_4'],
].fillna('')  # a single missing essay must not turn the merged text into NaN
# Join with a space so the last word of one essay is not fused with the first word of the next.
merged_first = (essay_parts.project_essay_1 + ' ' + essay_parts.project_essay_2).str.strip()
merged_second = (essay_parts.project_essay_3 + ' ' + essay_parts.project_essay_4).str.strip()
# Assign raw arrays so the write is positional and needs no index alignment.
train_test.loc[extra_rows, 'project_essay_1'] = merged_first.values
train_test.loc[extra_rows, 'project_essay_2'] = merged_second.values
train_test = train_test.drop(['project_essay_3', 'project_essay_4'], axis=1)

## Null value treatment

In [None]:
# Count missing values per column and show only the columns that have any.
null_counts = train_test.isnull().sum()
null_counts[null_counts > 0]

We find that there are null values in teacher_prefix. The other column is ignored as that is the target variable. 

In [None]:
# teacher_prefix is categorical, so missing entries are replaced by its most frequent value.
max_count = train_test["teacher_prefix"].value_counts().idxmax()
train_test["teacher_prefix"] = train_test["teacher_prefix"].fillna(max_count)
# Re-check which columns still contain nulls after the imputation.
remaining_nulls = train_test.isnull().sum()
remaining_nulls[remaining_nulls > 0]

In [None]:
# Count missing values per column of the resources table and show only the non-zero counts.
resource_null_counts = resources.isnull().sum()
resource_null_counts[resource_null_counts > 0]

In [None]:
# description is free text, so missing entries are replaced by the placeholder 'X'.
description_present = resources["description"].notnull()
resources["description"] = resources["description"].where(description_present, 'X')

## Feature engineering - 1. Resources dataset (Numerical features)
The second cell below, which creates features from descriptive statistics (min, max, mean), was inspired by other kernels and was found to have a good impact on accuracy.

In [None]:
# Per-resource-row features: character length of the description and the line total
# (quantity times unit price).
resources["desc_len"] = resources["description"].str.len()
resources["total_price"] = resources["quantity"] * resources["price"]
resources.head(3)

In [None]:
def concatenate(series):
    """Return every item of `series`, converted with str(), joined by single spaces."""
    return ' '.join(str(item) for item in series)

# Aggregate the resource rows of each project (`id`) into one row of summary features.
# Built-in reductions are requested by their string aliases rather than by NumPy callables:
# newer pandas releases deprecate passing np.sum / np.mean / np.min / np.max here, and the
# aliases produce stable column labels (e.g. ('desc_len', 'min')), whereas labels derived
# from a callable's name can differ between library versions.
resources_info = resources.groupby('id').agg({
    'description': ['nunique', concatenate],  # distinct descriptions + all text joined
    'quantity': ['sum'],
    'price': ['sum', 'mean'],
    'desc_len': ['mean', 'min', 'max'],
    'total_price': ['mean', 'min', 'max'],
})

In [None]:
# Preview the first five aggregated rows.
resources_info.head(5)

It looks like resources_info has nested (two-level) column labels of the form (column, aggregation). We will flatten them into single column names.

In [None]:
# Show the raw column labels of the aggregated frame before they are flattened.
resources_info.columns.values

In [None]:
# Collapse each (column, aggregation) label pair into a single "column_aggregation" name,
# then turn the `id` group key back into an ordinary column.
resources_info.columns = ['{}_{}'.format(*pair) for pair in resources_info.columns.values]
resources_info = resources_info.reset_index()
resources_info.head()

In [None]:
# Register the aggregated resource columns in the feature-name lists: everything except the
# join key and the concatenated description is numerical; the concatenated description is text.
numerical_cols.extend(resources_info.columns)
for non_numeric in ('id', 'description_concatenate'):
    numerical_cols.remove(non_numeric)
text_cols.append('description_concatenate')
numerical_cols

We will join train_test with resources_info on the project `id` column.

In [None]:
# Left join keeps every project row and attaches its aggregated resource features by `id`.
train_test = train_test.merge(resources_info, on="id", how="left")
train_test.head(3)

## Feature Engineering - 2. Categorical data
Creating dummy variables for categorical data.

In [None]:
# Derive the submission month as a string so it is treated as a category, not a number.
submitted_on = pd.DatetimeIndex(train_test['project_submitted_datetime'])
train_test['submitted_month'] = submitted_on.month.astype(str)

# One-hot encode the low-cardinality categorical columns and append the indicator columns.
dummy_colnames = ['teacher_prefix', 'submitted_month', 'school_state', 'project_grade_category']
dummies = pd.get_dummies(train_test[dummy_colnames])
train_test = pd.concat([train_test, dummies], axis=1)
train_test.head(1)

In [None]:
# Register the generated indicator columns as categorical features.
categorical_cols.extend(dummies.columns)

Columns like project_subject_categories and project_subject_subcategories hold several categories separated by commas. We will one-hot encode them: each distinct category gets its own column, whose value is 1 wherever that category is present.

In [None]:
def return_unique_elements(col_name):
    """Return the set of distinct comma-separated entries found in train_test[col_name].

    Each distinct cell value is split on commas and every piece is stripped of surrounding
    whitespace before it is added to the result.
    """
    distinct = set()
    for raw_value in train_test[col_name].unique().tolist():
        distinct.update(piece.strip() for piece in raw_value.split(','))
    return distinct

# Distinct labels seen in the category and subcategory columns.
unique_categories = return_unique_elements('project_subject_categories')
unique_subcategories = return_unique_elements('project_subject_subcategories')

# Sort the union: iterating a set of strings has no stable order between kernel restarts,
# and this list later determines the order of the indicator columns in the feature matrix.
total_categories = sorted(unique_subcategories.union(unique_categories))
total_categories

In [None]:
# One 0/1 indicator column per subject label. A row is flagged only when the label appears
# as a complete comma-separated entry in project_subject_categories or
# project_subject_subcategories, using the same split-and-strip rule that produced the labels.
# Substring matching via str.contains is avoided on purpose: it flags a row whenever one
# label is a substring of another (for example a label 'Music' against an entry
# 'Music & The Arts'), and it interprets the label as a regular expression.
row_labels = [
    {entry.strip() for entry in (cats + ',' + subs).split(',')}
    for cats, subs in zip(train_test.project_subject_categories.values,
                          train_test.project_subject_subcategories.values)
]
for category in total_categories:
    train_test[category] = np.array([int(category in labels) for labels in row_labels])
train_test.head(1)

In [None]:
categorical_cols += total_categories

In [None]:
%%time

# Keep the first 1000 rows (all columns) as a small sample for quick experiments.
train_test_sample = train_test.head(1000)
# vectorizer = CountVectorizer(stop_words=None,
#                                  max_features=1000,
#                                  binary=True,
#                                  ngram_range=(1,2))
# X = vectorizer.fit_transform(train_test['project_essay_1'])

In [None]:
# %%time
# tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english', max_features=1000)
# X = tfidf.fit_transform(train_test['project_essay_1'])

In [None]:
# #Getting the tokenized data into dataframe
# df1 = pd.DataFrame(X.toarray(), columns=tfidf.get_feature_names())
# df1.columns = ['project_essay_1_'+col for col in df1.columns.values]
# df1.head()

In [None]:
# Number of numerical and categorical feature columns registered so far.
len(numerical_cols) + len(categorical_cols)

## Feature Engineering - 3. Text data
Here we use the TF-IDF vectorizer. For each text column we build a vocabulary of at most 2000 of the most frequent terms (single words and two-word phrases, with English stop words removed) and normalize each row by its L2 norm. Each of these terms serves as an individual feature in the dataset. We considered 4 text variables: project_essay_1, project_essay_2, project_resource_summary, project_title.

In [None]:
def text_features(col_name):
    """Fit a TF-IDF vectorizer on train_test[col_name] and return its output.

    The vectorizer uses sublinear term frequency, English stop-word removal, unigrams and
    bigrams, a minimum document frequency of 5, L2 row normalization, and keeps at most
    2000 features.

    Returns:
        (X, feature_names): the matrix produced by fit_transform and the list of vocabulary
        terms, one per matrix column.
    """
    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english', max_features=2000)
    X = tfidf.fit_transform(train_test[col_name])
    # get_feature_names() was removed from recent scikit-learn releases in favour of
    # get_feature_names_out(); use whichever this installation provides and always
    # return a plain list.
    if hasattr(tfidf, 'get_feature_names_out'):
        feature_names = list(tfidf.get_feature_names_out())
    else:
        feature_names = tfidf.get_feature_names()
    return X, feature_names

In [None]:
%%time
# Vectorize each free-text column independently; every call fits its own vocabulary.
text_source_columns = ['project_essay_1', 'project_essay_2', 'project_resource_summary', 'project_title']
(
    (txt_essay_1, essay_1_features),
    (txt_essay_2, essay_2_features),
    (txt_summary, summary_features),
    (txt_title, title_features),
) = [text_features(column) for column in text_source_columns]

As these text features will have a lot of 0's, they are stored in sparse matrices. We then stack them horizontally, together with the numerical and categorical features, to create one sparse matrix; the four text columns alone contribute up to 8000 features.

In [None]:
# Assemble the full design matrix: numerical columns, categorical indicator columns and the
# four TF-IDF blocks, stacked side by side and stored in CSR format for fast row slicing.
# The dense blocks are cast to float64 explicitly: when the indicator columns mix boolean and
# integer dtypes, `.values` yields an object array that cannot be stacked into a numeric
# sparse matrix.
numerical_block = train_test[numerical_cols].values.astype(np.float64)
categorical_block = train_test[categorical_cols].values.astype(np.float64)
X = hstack((numerical_block, categorical_block, txt_essay_1, txt_essay_2, txt_summary, txt_title)).tocsr()

In [None]:
# Display the summary repr of the stacked feature matrix.
X

In [None]:
# Split the stacked matrix back into train and test rows using integer row positions.
# (Author's note: indexing X directly with the boolean Series, X[train_test["origin"] == 'train', :],
# killed the kernel through RAM overflow.)
# np.flatnonzero replaces pylab.find, which was deprecated in Matplotlib 2.2 and removed in 3.1.
is_train_row = (train_test["origin"] == 'train').values
is_test_row = (train_test["origin"] == 'test').values
X_train = X[np.flatnonzero(is_train_row), :]
X_test = X[np.flatnonzero(is_test_row), :]

Now that train and test features are created our next task is to have a look at number of approvals and rejections. We will also try to create a balanced dataset that is 50% approvals and 50% rejections. Once the balanced dataset is ready we can proceed to modelling where we will start with logistic regression. 

In [None]:
# Build a class-balanced training subset: every rejected project plus an equally sized
# random sample of approved projects.
# Row positions (not index labels) are used throughout because X_train is sliced by
# position; labels and positions only coincide while `train` keeps its default index.
approval_flags = train.project_is_approved.values
rejection_index = np.flatnonzero(approval_flags == 0)
approval_index = np.flatnonzero(approval_flags == 1)

# A dedicated, seeded generator makes the down-sampling and shuffling repeatable across
# kernel restarts without touching NumPy's global random state.
balance_rng = np.random.RandomState(0)
approval_permuted_index = balance_rng.permutation(approval_index)[:rejection_index.shape[0]]

# Join the two equally sized groups and shuffle them so the classes are interleaved.
balanced_indices = np.concatenate((approval_permuted_index, rejection_index))
balanced_permuted_indices = balance_rng.permutation(balanced_indices)

# Feature rows and matching labels of the balanced training subset.
X_balanced_data = X_train[balanced_permuted_indices, :]
Y_balanced = approval_flags[balanced_permuted_indices]

## Modeling and Validating data
Fitting a logistic regression model on balanced dataset

In [None]:
# %%time
# logistic = linear_model.LogisticRegression()
# model = logistic.fit(X_balanced_data, Y_balanced)
# print(model.score(X_balanced_data, Y_balanced))

In [None]:
%%time 
# L2-regularized logistic regression fitted on the balanced subset.
# The printed accuracy is measured on the same rows the model was fitted on.
logistic = linear_model.LogisticRegression(C=0.18285, penalty="l2")
model = logistic.fit(X_balanced_data, Y_balanced)
balanced_accuracy = model.score(X_balanced_data, Y_balanced)
print(balanced_accuracy)

In [None]:
# ROC AUC on the balanced subset, scored with the predicted probability of class 1.
balanced_probabilities = model.predict_proba(X_balanced_data)
Y_predicted = balanced_probabilities[:, 1]
roc_auc_score(Y_balanced, Y_predicted)

In [None]:
# Show the (rows, columns) shape of the test frame.
test.shape

In [None]:
%%time
# Refit the same logistic regression on all training rows (no class balancing) and report
# ROC AUC and accuracy measured on those same rows.
Y = train["project_is_approved"].values
logistic = linear_model.LogisticRegression(C=0.18285, penalty="l2")
model = logistic.fit(X_train, Y)
train_probabilities = model.predict_proba(X_train)
Y_predicted = train_probabilities[:, 1]
print(roc_auc_score(Y, Y_predicted))
print(model.score(X_train, Y))

In [None]:
# Predicted probability of class 1 for every test row, paired with the project id.
test_probabilities = model.predict_proba(X_test)
Y_pred_test = test_probabilities[:, 1]
test_output = pd.DataFrame(
    {'id': test['id'].values, 'project_is_approved': Y_pred_test},
    columns=['id', 'project_is_approved'],
)

In [None]:
#test_output.to_csv('csv_to_submit.csv', index = False)

In [None]:
# %%time
# from sklearn.model_selection import GridSearchCV
# # Dictionary with parameters names to try during search
# # We tried a lot of parameters; you may uncomment the code and experiment
# param_grid = {"C": np.linspace(0.24285-0.1, 0.24285+0.1, num=6)
#              # "union__numerical_pipe__logtransform__alpha": [0.8, 1],
#              # "union__text_pipe__tf_idf__stop_words": [None, 'english']
#              }
# logistic = linear_model.LogisticRegression()

# # run grid search
# grid_search = GridSearchCV(logistic, param_grid=param_grid,
#                                     scoring='roc_auc',
#                                     n_jobs=1,
#                                     verbose=1,
#                                     cv=3)
# best_model = grid_search.fit(X_balanced_data, Y_balanced)


In [None]:
# best_model.best_estimator_

Random Forest 

In [None]:
# %%time
# from sklearn.ensemble import RandomForestClassifier
# rf_model = RandomForestClassifier(n_estimators=250, min_samples_split=10, max_features="auto", random_state=0)
# rf_model = rf_model.fit(X_balanced_data, Y_balanced)
# Y_predicted = rf_model.predict_proba(X_balanced_data)[:, 1]
# roc_auc_score(Y_balanced, Y_predicted)

In [None]:
# from sklearn.model_selection import cross_val_score
# # rf_model = RandomForestClassifier(n_estimators=500, max_depth=5, max_features="auto", random_state=0)
# # rf_model = rf_model.fit(X_balanced_data, Y_balanced)
# # Y_predicted = rf_model.predict_proba(X_balanced_data)[:, 1]
# CV = 5
# Y = train.project_is_approved.values
# rf_model = RandomForestClassifier(n_estimators=500, max_depth=5, max_features="auto", random_state=0)
# accuracies = cross_val_score(rf_model, X_train, Y, scoring='roc_auc', cv=CV)

In [None]:
# cross_val_score

We will now be doing GridSearch based on cross validation by varying the hyperparameters to choose the best possible model

In [None]:
# %%time
# # Create hyperparameters range
# penalty = ['l1', 'l2']
# C = np.linspace(0.1, 25, num=15)
# hyperparameters = dict(C=C, penalty=penalty)

# # Create logistic regression
# logistic = linear_model.LogisticRegression()

# # Create grid search using 5-fold cross validation
# clf = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0, scoring='roc_auc')

# # Fit grid search
# best_model = clf.fit(X_balanced_data, Y_balanced)