In [1]:
# For this exercise, I will walk through this article linked below: 
# https://www.kdnuggets.com/2018/03/text-data-preprocessing-walkthrough-python.html

In [67]:
import pandas as pd # Uploading pandas dataframe
import re, string, unicodedata # Removing punctuation, converting rows into string
import nltk # Natural language processing library
import contractions # Parses contractions
import inflect # Generating plurals, singulars, numbers to words
from bs4 import BeautifulSoup # online scraper
from nltk import word_tokenize, sent_tokenize #Tokenization of words, sentences
from nltk.stem import LancasterStemmer, WordNetLemmatizer #Lemmatization of words
from nltk import punkt # Needed to tokenize words
from nltk.corpus import stopwords # Needed for normalization
from nltk.corpus import wordnet # Find meaning of words, synonyms, antonyms.
from sklearn.feature_extraction.text import CountVectorizer #Gets counts of words for sparse matrix
from scipy import sparse # Saves sparse matrix for later

In [57]:
# Read in kickstarter data as csv

kickstarter = pd.read_csv(r'kickstarter.csv', index_col = 0)

In [10]:
# Operating on the text columns, name, and blurb.

text = kickstarter_text['name'] + ' ' + kickstarter_text['blurb']

In [11]:
# Formatting everything as a string

to_string = text.astype(str)

In [23]:
# Replacing contractions.

def replace_contractions(text):
    """Replace contractions in string of text"""
    return contractions.fix(text)


In [14]:
# Replacing contractions

without_contractions = to_string.map(lambda x: replace_contractions(x))

In [18]:
# Applying word tokenization

tokens = without_contractions.map(lambda x: nltk.word_tokenize(x))

In [36]:
# Normalization

def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

# Taking out replacing numbers because not important for analysis.
def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = remove_stopwords(words)
    return words


In [22]:
# Normalizing words using the functions below.

normalized = tokens.map(lambda x: normalize(x))

In [None]:
# Lemmatizing verbs: changing verbs to infinite form 

lemmatized = normalized.map(lambda x: lemmatize_verbs(x))

In [64]:
# Saving the lemmatized version to a CSV to read in later.

lemmatized.to_csv('lemmatized.csv', index=False)

In [9]:
# Reads in csv from above to continue processing.

df = pd.read_csv('lemmatized.csv', names=['list of words'], header=None)
lemmatized = df[1:]

In [10]:
# Get only frequent words to remove product names and other specific words

lemmatized.head()

Unnamed: 0,list of words
1,"['remix', 'explore', 'paint', 'place', 'digita..."
2,"['photo', 'grant', 'hazel', 'eat', 'cake', 'ba..."
3,"['minecraft', 'digital', 'artanimations', 'nee..."
4,"['sacramento', 'nature', 'photograph', 'series..."
5,"['north', 'africa', 'art', 'project', 'dream',..."


In [28]:
# Gets rid of numbers

lemmatized['no_numbers'] = lemmatized['list of words'].apply(lambda x: re.sub('(\d)+', '', x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [68]:
#Gets rid of rare words

vectorizer = CountVectorizer(min_df=.001) 
X = vectorizer.fit_transform(lemmatized['no_numbers'])
y = kickstarter['binary_state']

In [66]:
# Saves matrix for machine learning in other notebook. Comment is to read in matrix.

sparse.save_npz("kickstarter.npz", X)
# your_matrix_back = sparse.load_npz("kickstarter.npz")

AttributeError: vocabulary not found