![logo](1_bDwEvCRgrKVbLrAXEixpfA.png)
___

##### importing libraries

In [10]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline 
sns.set(color_codes=True)

import functools as ft

#bag of words model
from sklearn.feature_extraction.text import CountVectorizer
import re

#natural language processing
#pip install nltk
import nltk
#stop-word list used to filter tokens out of each blurb
nltk.download('stopwords')
#WordNet corpus: WordNetLemmatizer.lemmatize() looks words up in it, so without this
#download a fresh environment fails with a LookupError on the first lemmatize() call
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

#classification 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vlad_\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Step 5 - Analysis II: NLP with outliers 
    a) Importing Data
    b) Success-Percentage Column
    c) Natural Language Processing

## a) Importing data

In [2]:
#read the cleaned dataset and discard the 'Unnamed: 0' column
#(presumably an index saved by an earlier to_csv call - verify against the producing notebook)
cleaned_df = pd.read_csv("cleaned_data.csv").drop(columns=['Unnamed: 0'])

#report the frame dimensions and how many projects fall into each 'state'
print(cleaned_df.shape)
print(cleaned_df['state'].value_counts())

#preview the first record
cleaned_df.head(1)

(149563, 21)
successful    93849
failed        55714
Name: state, dtype: int64


Unnamed: 0,id,name,genre,subgenre,category,source_url,blurb,slug,goal,converted_pledged_amount,...,launched_at,deadline,country,currency,backers_count,disable_communication,is_starrable,spotlight,staff_pick,state
0,498799566,"Strange Wit, an original graphic novel about J...",Graphic Novels,comics,"{""id"":252,""name"":""Graphic Novels"",""slug"":""comi...",https://www.kickstarter.com/discover/categorie...,"The true biography of the historical figure, w...",strange-wit-an-original-graphic-novel-about-ja...,12000,14740,...,2015-08-15 04:19:27,2015-09-14 04:19:27,US,USD,403,0,0,1,1,successful


### b) Success-Percentage Column
Make a new column for success % (pledged amount / goal amount × 100)

In [3]:
#success percentage = pledged amount as a share of the goal, expressed in percent
cleaned_df['success_percentage'] = (
    cleaned_df['converted_pledged_amount']
    .div(cleaned_df['goal'])
    .mul(100)
)

In [5]:
#gather the diagnostics for the new column first, then report them
success_pct = cleaned_df['success_percentage']
null_count = success_pct.isnull().sum()
na_count = success_pct.isna().sum()
#tally of finite (False) versus infinite (True) entries
inf_tally = np.isinf(success_pct).value_counts()

print('null:', null_count)
print('na:', na_count)
print('\n')
print('inf:', inf_tally)

null: 0
na: 0


inf: False    149562
True          1
Name: success_percentage, dtype: int64


In [6]:
#locate every row whose 'success_percentage' is infinite.
#the rows are derived from the data itself rather than from a row number copied out of
#an earlier output, so the report stays correct if the input file changes
success_pct = cleaned_df['success_percentage']
inf_mask = np.isinf(success_pct.values)
print(cleaned_df.index[inf_mask])
print('\n')
#show the offending value of each flagged row (nothing is printed when there is none)
for position in np.flatnonzero(inf_mask):
    print('Row {} "success_percentage" value:'.format(cleaned_df.index[position]), success_pct.iloc[position])

Int64Index([118304], dtype='int64')


Row 118304 "success_percentage" value: inf


In [7]:
#drop every row whose 'success_percentage' is infinite.
#rows are selected by value instead of by a hard-coded position: dropping a fixed
#position would delete a valid row if this cell were run a second time, or if the
#input data changed and the infinite value sat somewhere else
inf_mask = np.isinf(cleaned_df['success_percentage'].values)

#keep the finite rows and renumber the index from 0 so it has no gaps
cleaned_df = cleaned_df.loc[~inf_mask].reset_index(drop=True)

## total number of rows and columns
print('dataframe shape: ', cleaned_df.shape)

dataframe shape:  (149562, 22)


### c) Natural Language Processing

##### Clean the first row to check

In [11]:
#hold the blurb descriptions in their own single-column frame called 'text'
text = cleaned_df['blurb'].to_frame()

#display the first blurb; it is cleaned on its own as a first step
text.loc[0, 'blurb']

'The true biography of the historical figure, writer, alcoholic, lesbian, and world traveler: Jane Sydney Auer Bowles.'

In [12]:
#build the stop-word set and the lemmatizer once, up front;
#re-creating the set for every token repeats the same work for no benefit
stop_words = set(stopwords.words('english'))
wn = WordNetLemmatizer()

#only keep letters and replace other symbols with a white space in the first blurb
blurb = re.sub('[^a-zA-Z]', ' ', text['blurb'][0])

#change letters to lower-case and split the string into a list of words
blurb = blurb.lower().split()

#remove stop-words and lemmatize the words that remain
blurb = [wn.lemmatize(word) for word in blurb if word not in stop_words]

#join blurb back into a string from a list
blurb = ' '.join(blurb)

blurb

'true biography historical figure writer alcoholic lesbian world traveler jane sydney auer bowles'

### Clean the entire 'blurb' column

In [15]:
#number of blurbs to clean: one per row of the dataframe
length = cleaned_df.shape[0]
length

149562

In [16]:
def clean_blurb(raw_blurb, stop_words, lemmatizer):
    """Normalise a single blurb for the bag-of-words model.

    Non-letter characters are replaced by spaces, the text is lower-cased and
    split into words, stop-words are discarded and every remaining word is
    lemmatized.

    Parameters
    ----------
    raw_blurb : str
        The blurb text to clean.
    stop_words : set of str
        Lower-case words to discard.
    lemmatizer : object with a ``lemmatize(word)`` method
        For example ``WordNetLemmatizer()``.

    Returns
    -------
    str
        The cleaned words joined by single spaces.
    """
    #only keep letters and replace other symbols with a white space
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_blurb)
    #change letters to lower-case and split into words
    words = letters_only.lower().split()
    #remove stop-words, lemmatize the rest and join back into a string
    return ' '.join(lemmatizer.lemmatize(word) for word in words if word not in stop_words)


#build the stop-word set and the lemmatizer once for the whole column;
#re-creating them for every word / every row only repeats identical work
stop_words = set(stopwords.words('english'))
wn = WordNetLemmatizer()

#clean every blurb, keeping the original row order
corpus = [clean_blurb(raw_blurb, stop_words, wn) for raw_blurb in text['blurb']]

In [17]:
#wrap the cleaned blurbs in a single-column dataframe, ready to be saved to a csv for the classification step
corpus_df = pd.DataFrame({'blurb': corpus})
corpus_df.head()

Unnamed: 0,blurb
0,true biography historical figure writer alcoho...
1,graphic novel two magical lady love
2,publishing magazine focus folklore occult para...
3,educating community self sufficiency building ...
4,rewarding thing event attend


### End of Step 5

In [18]:
#save the cleaned corpus, row index included, as a .csv file
corpus_df.to_csv(path_or_buf='corpus_data_w_outliers.csv', index=True)

In [19]:
#save the dataframe with the success_percentage column, row index included, as a .csv file
cleaned_df.to_csv(path_or_buf='cleaned2_data_w_outliers.csv', index=True)