In [3]:
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import pandas as pd 



ValueError: numpy.ufunc size changed, may indicate binary incompatibility. Expected 216 from C header, got 192 from PyObject

Suppose we have a table of employees on which we want to perform some analysis.

In [None]:
# Raw employee records, one key per column. The table is deliberately messy:
# 'dexterity' contains a missing value, 'speed' mixes strings with numbers,
# and 'salary' is stored as formatted text.
d = {
    'name': ['Billy', 'Mandy', 'Courage', 'Blossom', 'Buttercup'],
    'strength': [2.0, 3.0, 1.0, 20.0, 22.0],
    'dexterity': [5.0, 8.0, np.nan, 6.0, 5.0],
    'speed': ['low', 'low', 10.0, 31.0, 30.0],
    'salary': ['$10,000', '$100,000', '$40,000', '$70,000', '$70,000'],
}
df = pd.DataFrame(d)
df

We note that the dexterity of Courage is Not a Number (NaN). We could proceed with one of the following options:


*   Delete the Courage row
*   Delete the dexterity column
*   Impute a value of dexterity for Courage

Let's impute Courage's dexterity using the mean dexterity of the other employees.



In [None]:
# Impute missing dexterity values with the column mean (Series.mean skips NaN,
# so the mean is taken over the employees whose dexterity is known).
# Passing a dict restricts the fill to the 'dexterity' column; a bare scalar
# would write the dexterity mean into NaNs of every other column as well.
df = df.fillna({'dexterity': df['dexterity'].mean()})
df

We can see that Billy and Mandy have a speed of 'low'. Let's replace 'low' with a numerical speed of 5.

In [None]:
# Swap the 'low' placeholder for a numeric speed, then cast explicitly:
# the column starts out as object dtype (strings mixed with floats), and
# whether replace() converts it to a numeric dtype differs between pandas
# versions, so the cast guarantees a float column for later analysis.
df['speed'] = df['speed'].replace('low', 5.0).astype(float)
df

The salary is not stored as a number but as a string, e.g. '$10,000', so let's convert it into a number.

In [None]:
# Strip every non-digit character ('$' and ',') and convert to float.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, so r'\D' would match nothing and the float
# conversion would fail on values such as '$10,000'.
df['salary'] = df['salary'].str.replace(r'\D', '', regex=True).astype(float)
df

Let's consider a different example, this time with a time series. Suppose we have two prices over time: the price of wheat and the price of barley. For simplicity, we will create some fake data.


In [None]:
# Seed NumPy's global generator so the synthetic prices, and every figure and
# statistic derived from them below, are identical on each run.
np.random.seed(0)

Fs = 24        # number of samples in one full cycle of the sinusoid
sample = 240   # total number of samples generated
time = np.arange(sample)

# Each price is a sinusoid plus Gaussian noise, shifted by a constant offset.
# Barley has a larger amplitude, noisier values and a higher baseline.
wheat = np.sin(2 * np.pi * time / Fs)
wheat += np.random.normal(loc=0.0, scale=0.4, size=(time.size,)) + 2
barley = 5*np.sin(2 * np.pi * time / Fs)
barley += np.random.normal(loc=0.0, scale=2.0, size=(time.size,)) + 10

# Label both curves so the figure can be read without the surrounding text.
plt.plot(time, wheat, label='wheat')
plt.plot(time, barley, label='barley')
plt.xlabel('time (hours)')
plt.ylabel('price')
plt.legend()
plt.show()

We can visualise the price distributions of wheat and barley with a box-and-whisker plot.



In [None]:
# Put the two price series side by side as columns; boxplot draws one box
# per column of a 2-D array.
data = np.column_stack((wheat, barley))
fig, ax = plt.subplots()
ax.boxplot(data)
# Ticks at 0, 1 and 2: position 0 carries an empty label, the two boxes are
# labelled with the series they summarise.
ax.set_xticks(np.arange(3))
ax.set_xticklabels(['', 'wheat', 'barley'])
ax.set_ylabel('price')
plt.show()

We can also visualise the data with histograms

In [None]:
# Overlay the two price distributions; the bars are partly transparent so
# overlapping bins stay visible. Each histogram is labelled and a legend is
# drawn so the two series can be told apart.
plt.hist(wheat, alpha=0.8, label='wheat')
plt.hist(barley, alpha=0.8, label='barley')
plt.xlabel('price')
plt.ylabel('frequency')
plt.legend()
plt.show()

Are the prices of wheat and barley correlated?

In [None]:
# Fit a degree-1 polynomial (straight line) of barley against wheat:
# m is the slope and c the intercept.
m, c = np.polyfit(wheat, barley, deg=1)
fitted = m * wheat + c

# Draw the fitted line first, then the raw observations as a scatter.
plt.plot(wheat, fitted, 'red')
plt.scatter(wheat, barley)
plt.xlabel('wheat')
plt.ylabel('barley')
plt.show()

# The off-diagonal entry of the 2x2 correlation matrix is the coefficient
# between the two series.
corr = np.corrcoef(wheat, barley)[0, 1]
print('The correlation between the price of wheat and barley is', corr)

Let's look at the autocorrelation and partial autocorrelation functions for wheat.

In [None]:
# Show the wheat series itself for reference.
plt.plot(time, wheat)
plt.xlabel('time (hours)')
plt.ylabel('price')

# Draw the autocorrelation and then the partial autocorrelation of wheat,
# each out to 50 lags.
for draw_correlogram in (plot_acf, plot_pacf):
    draw_correlogram(wheat, lags=50)
plt.show()