# Matrix Processing and Numpy

In [1]:
import numpy
import json

In [2]:
# NOTE(review): absolute, machine-specific path -- point this at your own copy of the dataset.
path = "/Users/yaohanli/Downloads/yelp_dataset/review.json"
# Decode explicitly as UTF-8 (the encoding JSON text uses) instead of relying on
# the platform's locale default, which differs between operating systems.
# The handle stays open on purpose: the loading loop reads from it line by line.
f = open(path, encoding="utf-8")

In [3]:
# Accumulator for the parsed review records (one dict per review).
dataset = list()

In [4]:
# Load up to 50,000 reviews; the file holds one JSON object per line.
while len(dataset) < 50000:
    line = f.readline()
    if not line:
        # readline() returns '' at end of file. Stop here rather than letting
        # json.loads('') raise JSONDecodeError when the file is shorter than expected.
        break
    dataset.append(json.loads(line))

In [6]:
# Peek at the first parsed record to see which fields a review carries.
dataset[0]

{'review_id': 'Q1sbwvVQXV2734tPgoKj4Q',
 'user_id': 'hG7b0MtEbXx5QzbzE6C_VA',
 'business_id': 'ujmEBvifdJM6h6RLv4wQIg',
 'stars': 1.0,
 'useful': 6,
 'funny': 1,
 'cool': 0,
 'text': 'Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.',
 'date': '2013-05-07 04:34:36'}

## Extracting basic statistics from the data

In [8]:
# Star rating of every loaded review, in file order.
ratings = [review['stars'] for review in dataset]

In [9]:
# 'cool' vote count of every loaded review, in file order.
cool = [review['cool'] for review in dataset]

In [10]:
# 'funny' vote count of every loaded review, in file order.
funny = [review['funny'] for review in dataset]

In [11]:
# 'useful' vote count of every loaded review, in file order.
useful = [review['useful'] for review in dataset]

In [12]:
# Convert the four Python lists into NumPy arrays so that vectorised
# statistics and linear algebra can be applied to them below.
ratings, cool, funny, useful = map(numpy.array, (ratings, cool, funny, useful))

In [13]:
# Display the ratings array (NumPy abbreviates long arrays with '...').
ratings

array([1., 5., 5., ..., 4., 2., 5.])

In [15]:
# Average star rating across all loaded reviews.
ratings.mean()

3.74318

In [16]:
# Variance of the star ratings (NumPy's default, i.e. ddof=0).
ratings.var()

2.1093034876

In [17]:
# Highest star rating present in the sample.
ratings.max()

5.0

In [19]:
# Compose the three 1-D vectors into one 2-D array: each vector becomes a row.
numpy.stack((ratings, cool, funny), axis=0)

array([[1., 5., 5., ..., 4., 2., 5.],
       [0., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [20]:
# Stack ratings/cool/funny as rows, then transpose so that every row is one
# review and the columns are (stars, cool, funny).
features = numpy.transpose(numpy.stack((ratings, cool, funny)))

In [21]:
# Display the feature array: one row per review, one column per feature.
features

array([[1., 0., 1.],
       [5., 0., 0.],
       [5., 0., 0.],
       ...,
       [4., 1., 0.],
       [2., 0., 0.],
       [5., 0., 0.]])

In [22]:
# Wrap the 2-D array in numpy.matrix so that `*` means matrix multiplication below.
# NOTE(review): this rebinds `features` to a different type, and NumPy's docs no
# longer recommend numpy.matrix -- consider keeping an ndarray and using `@`.
features = numpy.matrix(features)

In [23]:
# Same values as before; the repr now shows the `matrix` type.
features

matrix([[1., 0., 1.],
        [5., 0., 0.],
        [5., 0., 0.],
        ...,
        [4., 1., 0.],
        [2., 0., 0.],
        [5., 0., 0.]])

In [24]:
# 3x3 product X^T X of the feature matrix with itself.
# `@` is always the matrix product, whereas `*` only means that for numpy.matrix
# (on a plain ndarray it would attempt an element-wise product instead).
features.T @ features

matrix([[806035., 108469.,  74872.],
        [108469., 205639., 124459.],
        [ 74872., 124459., 129252.]])

In [25]:
# Inverse of the 3x3 product X^T X.
# `@` is used for the product so the expression means matrix multiplication
# regardless of whether `features` is a numpy.matrix or a plain ndarray.
numpy.linalg.inv(features.T @ features)

matrix([[ 1.33825270e-06, -5.67360756e-07, -2.28890104e-07],
        [-5.67360756e-07,  1.18961914e-05, -1.11263938e-05],
        [-2.28890104e-07, -1.11263938e-05,  1.85832119e-05]])

## Numpy allows matrices to be used within complex mathematical expressions, in order to perform transformations of our data:

In [26]:
# Element-wise transform: take the sine of every entry, scale by 2, shift by 3.
3 + 2 * numpy.sin(features)

matrix([[4.68294197, 3.        , 4.68294197],
        [1.08215145, 3.        , 3.        ],
        [1.08215145, 3.        , 3.        ],
        ...,
        [1.48639501, 4.68294197, 3.        ],
        [4.81859485, 3.        , 3.        ],
        [1.08215145, 3.        , 3.        ]])

In [27]:
# Arithmetic binds tighter than comparison, so the transform is evaluated first
# and then compared element-wise with 4, yielding a boolean matrix.
(2 * numpy.sin(features) + 3) > 4

matrix([[ True, False,  True],
        [False, False, False],
        [False, False, False],
        ...,
        [False,  True, False],
        [ True, False, False],
        [False, False, False]])

# Other features:
- ndarray.shape: Get the shape of an array
- reshape: change the dimensions of an array/matrix
- arange: Create an array containing a range of numbers
- numpy.random: generate (arrays of) random numbers
- sum, min, max, etc.: reduction operations on matrices
- eye: identity matrix
- trace, eig, etc.: linear algebra operations (eig is found in numpy.linalg)
- See more: https://numpy.org/doc/stable/user/quickstart.html

In [28]:
# 3x3 identity matrix.
numpy.eye(3)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [29]:
# Element type of the array returned by eye when no dtype is given.
numpy.eye(3).dtype

dtype('float64')