In [52]:
import json
import os
import pandas as pd
import re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Location of the raw auction-lot export.
data_path = 'data/'
lots_path = data_path + 'lots.json'

# An empty file cannot be parsed as JSON. Fail loudly here instead of leaving
# `lots` unbound (NameError below) or silently reusing a stale `lots` from an
# earlier kernel run.
if os.stat(lots_path).st_size == 0:
    raise ValueError('%s is empty; cannot build dataframe train' % lots_path)

# JSON text is UTF-8 by specification; do not depend on the platform default.
with open(lots_path, encoding='utf-8') as fp:
    lots = json.load(fp)

train = pd.DataFrame(lots)
print('loaded', lots_path, 'to dataframe train with', len(lots), 'entries')

loaded data/lots.json to dataframe train with 94364 entries


In [12]:
# Print a per-column summary of the dataframe (non-null counts and dtypes).
train.info()
# Display the (rows, columns) shape as the cell's result.
train.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94364 entries, 0 to 94363
Data columns (total 24 columns):
artist_name               94364 non-null object
artist_name_normalized    94364 non-null object
auction_house_name        94364 non-null object
created_year              94216 non-null float64
currency                  94364 non-null object
description               94364 non-null object
essay                     94364 non-null object
exhibited_in              94364 non-null object
exhibited_in_museums      94364 non-null int64
height                    41712 non-null float64
id                        94364 non-null object
lot_id                    94364 non-null object
max_estimated_price       94364 non-null int64
min_estimated_price       94364 non-null int64
price                     94364 non-null int64
provenance                94364 non-null object
provenance_estate_of      94364 non-null object
sale_date                 94364 non-null object
sale_id                   943

(94364, 24)

In [37]:
# Show the raw description text of the lot at row label 0.
print(train["description"][0])

AN EXCEPTIONALLY RARE AND IMPORTANT BLUE AND WHITE JAR, GUAN YUAN DYNASTY, MID 14TH CENTURY Robustly potted standing on a broad, low foot rising to a full rounded shoulder below a short cylindrical neck and a slightly thickened mouth rim, vividly painted in a deep and vibrant cobalt blue around the body with a narrative scene depicting a robed figure seated in a two-wheeled cart drawn by a tiger and a leopard, following two foot soldiers each carrying a spear, approaching a rustic bridge across a stream beneath a waterfall, the cart followed by two equestrians on either side of dramatically painted rocks, one in military attire and carrying a banner bearing the characters Gui Gu, the other in scholar's clothes, on a prancing piebald horse and turning towards the first horseman, the composition punctuated by pine, bamboo, flowering prunus, plantain, rose and willow, all between a classic wave band around the neck, a peony scroll around the shoulder, and a band of upright lappets enclosi

In [38]:

# Everything that is not an ASCII letter (digits, punctuation, whitespace,
# accented characters) is replaced by a single space, one-for-one.
non_letter_pattern = re.compile("[^a-zA-Z]")
letters_only = non_letter_pattern.sub(" ", train["description"][0])
print(letters_only)

AN EXCEPTIONALLY RARE AND IMPORTANT BLUE AND WHITE JAR  GUAN YUAN DYNASTY  MID   TH CENTURY Robustly potted standing on a broad  low foot rising to a full rounded shoulder below a short cylindrical neck and a slightly thickened mouth rim  vividly painted in a deep and vibrant cobalt blue around the body with a narrative scene depicting a robed figure seated in a two wheeled cart drawn by a tiger and a leopard  following two foot soldiers each carrying a spear  approaching a rustic bridge across a stream beneath a waterfall  the cart followed by two equestrians on either side of dramatically painted rocks  one in military attire and carrying a banner bearing the characters Gui Gu  the other in scholar s clothes  on a prancing piebald horse and turning towards the first horseman  the composition punctuated by pine  bamboo  flowering prunus  plantain  rose and willow  all between a classic wave band around the neck  a peony scroll around the shoulder  and a band of upright lappets enclosi

In [42]:
lower_case = letters_only.lower()        # Convert to lower case
words = lower_case.split()               # Split on whitespace into a list of words
#import nltk
#nltk.download()  # Download text data sets, including stop words
# To fetch the NLTK data: start python in a Jupyter terminal window, then run
# the two commands above.
print(words)

['an', 'exceptionally', 'rare', 'and', 'important', 'blue', 'and', 'white', 'jar', 'guan', 'yuan', 'dynasty', 'mid', 'th', 'century', 'robustly', 'potted', 'standing', 'on', 'a', 'broad', 'low', 'foot', 'rising', 'to', 'a', 'full', 'rounded', 'shoulder', 'below', 'a', 'short', 'cylindrical', 'neck', 'and', 'a', 'slightly', 'thickened', 'mouth', 'rim', 'vividly', 'painted', 'in', 'a', 'deep', 'and', 'vibrant', 'cobalt', 'blue', 'around', 'the', 'body', 'with', 'a', 'narrative', 'scene', 'depicting', 'a', 'robed', 'figure', 'seated', 'in', 'a', 'two', 'wheeled', 'cart', 'drawn', 'by', 'a', 'tiger', 'and', 'a', 'leopard', 'following', 'two', 'foot', 'soldiers', 'each', 'carrying', 'a', 'spear', 'approaching', 'a', 'rustic', 'bridge', 'across', 'a', 'stream', 'beneath', 'a', 'waterfall', 'the', 'cart', 'followed', 'by', 'two', 'equestrians', 'on', 'either', 'side', 'of', 'dramatically', 'painted', 'rocks', 'one', 'in', 'military', 'attire', 'and', 'carrying', 'a', 'banner', 'bearing', 'the

Stemming and lemmatizing follow the approach described at: http://textminingonline.com/dive-into-nltk-part-iv-stemming-and-lemmatization

In [46]:
# Lazily-built NLTK helpers shared by every call to review_to_words().
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the shared stop-word set, stemmer and lemmatizer, building them once."""
    if not _TEXT_TOOLS:
        # Build everything into locals first so a failure (e.g. missing NLTK
        # data) never leaves a half-populated cache behind.
        stops = frozenset(stopwords.words("english"))
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        _TEXT_TOOLS.update(stops=stops, stemmer=stemmer, lemmatizer=lemmatizer)
    return _TEXT_TOOLS


def review_to_words( review_text ):
    """Convert one raw text (here: an auction lot description) to cleaned tokens.

    Steps, in order:
      1. replace every character that is not an ASCII letter with a space,
      2. lower-case and split on whitespace,
      3. drop English stop words (NLTK list),
      4. Porter-stem each remaining word,
      5. run the WordNet lemmatizer over each stem.

    Parameters
    ----------
    review_text : str
        The raw text to clean.

    Returns
    -------
    str
        The processed tokens joined by single spaces ("" if nothing remains).

    Notes
    -----
    The stop-word set, stemmer and lemmatizer are created on the first call
    and reused afterwards; rebuilding them for every row re-reads the
    stop-word corpus each time and dominates the runtime on large frames.
    NOTE(review): the lemmatizer is applied to already-stemmed tokens, which
    are often not dictionary words, so it presumably changes few of them;
    kept as-is so the produced features do not change.
    """
    tools = _get_text_tools()
    stops = tools["stops"]
    stem = tools["stemmer"].stem
    lemmatize = tools["lemmatizer"].lemmatize

    # Remove non-letters, then lower-case and split into individual words.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()

    # Set membership makes the stop-word filter O(1) per word.
    meaningful_words = [w for w in words if w not in stops]

    # Stem first, then lemmatize the stems (same order as before).
    lemmatized_stemmed_meaningfull_words = [lemmatize(stem(w)) for w in meaningful_words]

    # Join the words back into one string separated by space.
    return " ".join(lemmatized_stemmed_meaningfull_words)

In [47]:
# Run the full cleaning pipeline on the first description as a sanity check.
clean_review = review_to_words( train["description"][0] )
print(clean_review)

except rare import blue white jar guan yuan dynasti mid th centuri robustli pot stand broad low foot rise full round shoulder short cylindr neck slightli thicken mouth rim vividli paint deep vibrant cobalt blue around bodi narr scene depict robe figur seat two wheel cart drawn tiger leopard follow two foot soldier carri spear approach rustic bridg across stream beneath waterfal cart follow two equestrian either side dramat paint rock one militari attir carri banner bear charact gui gu scholar cloth pranc piebald hors turn toward first horseman composit punctuat pine bamboo flower prunu plantain rose willow classic wave band around neck peoni scroll around shoulder band upright lappet enclos emblem around base hairlin crack shoulder cm diam wood stand fit box


In [48]:
# Number of descriptions to process (one per row of the dataframe column).
num_reviews = train["description"].size

print("Cleaning and parsing the training set movie reviews...\n")

# Clean every description exactly once. The earlier version ran the whole
# corpus through review_to_words() twice and threw the first result away.
# Iterating the column directly avoids a label lookup per row.
clean_train_reviews = []
for i, description in enumerate( train["description"] ):
    # Report progress after every 1000th description.
    if (i + 1) % 1000 == 0:
        print("Review %d of %d\n" % ( i + 1, num_reviews ))
    clean_train_reviews.append( review_to_words( description ) )

Cleaning and parsing the training set movie reviews...

Review 1000 of 94364

Review 2000 of 94364

Review 3000 of 94364

Review 4000 of 94364

Review 5000 of 94364

Review 6000 of 94364

Review 7000 of 94364

Review 8000 of 94364

Review 9000 of 94364

Review 10000 of 94364

Review 11000 of 94364

Review 12000 of 94364

Review 13000 of 94364

Review 14000 of 94364

Review 15000 of 94364

Review 16000 of 94364

Review 17000 of 94364

Review 18000 of 94364

Review 19000 of 94364

Review 20000 of 94364

Review 21000 of 94364

Review 22000 of 94364

Review 23000 of 94364

Review 24000 of 94364

Review 25000 of 94364

Review 26000 of 94364

Review 27000 of 94364

Review 28000 of 94364

Review 29000 of 94364

Review 30000 of 94364

Review 31000 of 94364

Review 32000 of 94364

Review 33000 of 94364

Review 34000 of 94364

Review 35000 of 94364

Review 36000 of 94364

Review 37000 of 94364

Review 38000 of 94364

Review 39000 of 94364

Review 40000 of 94364

Review 41000 of 94364

Review 420

In [50]:
# Preview only the first few cleaned descriptions; displaying the whole list
# would write one long string per dataframe row into the notebook output.
clean_train_reviews[:5]

['except rare import blue white jar guan yuan dynasti mid th centuri robustli pot stand broad low foot rise full round shoulder short cylindr neck slightli thicken mouth rim vividli paint deep vibrant cobalt blue around bodi narr scene depict robe figur seat two wheel cart drawn tiger leopard follow two foot soldier carri spear approach rustic bridg across stream beneath waterfal cart follow two equestrian either side dramat paint rock one militari attir carri banner bear charact gui gu scholar cloth pranc piebald hors turn toward first horseman composit punctuat pine bamboo flower prunu plantain rose willow classic wave band around neck peoni scroll around shoulder band upright lappet enclos emblem around base hairlin crack shoulder cm diam wood stand fit box',
 'qi baishi monkey contempl peach inscrib sign one seal artistd xinwei year dedic shengquanpap ink x cm x th centuri',
 'qi baishi three plenti entitl inscrib sign one seal artistdat spring renshen year dedic shengquanscrol mou

In [53]:
print("Creating the bag of words...\n")

# scikit-learn's bag-of-words tool, with the vocabulary capped at 5000 terms.
# Line continuations are implicit inside the parentheses, so no backslashes.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# fit_transform() learns the vocabulary from the list of cleaned strings and
# returns the per-document term counts as a sparse matrix; toarray() then
# turns that into a dense numpy array (one cell per document/term pair).
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()

Creating the bag of words...



In [56]:
# Show the term-count vector of the second description (row index 1).
print(train_data_features[1])

[0 0 0 ..., 0 0 0]


In [57]:
# Shape of the dense feature array: (number of descriptions, vocabulary size).
print (train_data_features.shape)

(94364, 5000)


In [58]:
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()
print (vocab)

['aa', 'aaron', 'ab', 'abandon', 'abbey', 'abbot', 'abbott', 'abl', 'abpc', 'abraham', 'abramov', 'abras', 'absenc', 'absorb', 'abstract', 'academi', 'acanthu', 'accent', 'accept', 'access', 'accessori', 'accid', 'accompagn', 'accompani', 'accomplish', 'accord', 'account', 'accur', 'achiev', 'acid', 'acknowledg', 'acorn', 'acquaint', 'acquir', 'acquisit', 'acr', 'across', 'acryl', 'act', 'action', 'activ', 'actor', 'actual', 'ad', 'adam', 'adapt', 'add', 'addit', 'address', 'ade', 'adhes', 'adjust', 'administr', 'admir', 'adolph', 'adopt', 'ador', 'adress', 'adrian', 'advanc', 'adventur', 'advertis', 'advic', 'advis', 'ae', 'affair', 'affect', 'affich', 'affix', 'africa', 'african', 'afternoon', 'afterward', 'agat', 'age', 'agent', 'agit', 'agn', 'ago', 'agre', 'agreement', 'ah', 'ai', 'aid', 'aim', 'ainsi', 'air', 'ajout', 'al', 'alan', 'alarm', 'albert', 'alberto', 'album', 'albumen', 'aldin', 'aldu', 'aleksandr', 'aleksandrovich', 'aleksei', 'alexand', 'alexandr', 'alfa', 'alfr', 'a