In [1]:
import numpy as np

x = np.array([3,4,5])
y = np.array([4,9,7])
x + y

array([ 7, 13, 12])

In [2]:
x = np.array([[1,2], [3,4]])
print(x.ndim) # output will show that it's a two-dimensional array.
print(x.shape) # shows how many rows and columns the array has.

# x.dtype to see what datatype x is. only integers, if we write decimals we will get float-values.

y = np.array([[1,2], [3.0,4]]).dtype
print(y)


2
(2, 2)
float64


In [3]:
# sum
x = np.array([1,2,3,4])
print(x.sum())


10


In [4]:
# reshaping
x = np.array([1,2,3,4,5,6])
print("Beginning of x:\n", x)
x_reshape = x.reshape((2,3))
print("Reshaped x:\n", x_reshape)

print(x_reshape[0,0])
print(x_reshape[1,2])


Beginning of x:
 [1 2 3 4 5 6]
Reshaped x:
 [[1 2 3]
 [4 5 6]]
1
6


In [5]:
print("x before we modify x_reshape:\n", x)
print("x_reshape before we modify x_reshape:\n", x_reshape)
x_reshape[0,0] = 5
print("x_reshape after we modify it's top left element:\n", x_reshape)
print("x after we modify top left element of x_reshape:\n", x)

x before we modify x_reshape:
 [1 2 3 4 5 6]
x_reshape before we modify x_reshape:
 [[1 2 3]
 [4 5 6]]
x_reshape after we modify it's top left element:
 [[5 2 3]
 [4 5 6]]
x after we modify top left element of x_reshape:
 [5 2 3 4 5 6]


In [6]:
print(x_reshape.shape, x_reshape.ndim, x_reshape.T)

print(np.sqrt(x))

x**2

(2, 3) 2 [[5 4]
 [2 5]
 [3 6]]
[2.23606798 1.41421356 1.73205081 2.         2.23606798 2.44948974]


array([25,  4,  9, 16, 25, 36])

In [7]:
x = np.random.normal(size=50)

y = x + np.random.normal(loc=50, scale=1, size=50)

np.corrcoef(x,y) # correlation-matrix

print(np.random.normal(scale=5, size=2))
print(np.random.normal(scale=5, size=2))

[-5.52992756 -1.30451377]
[-9.11959751  1.23856856]


In [8]:
# for calculations generating random quantities. rng = random number generation. rng.normal = normal data
rng = np.random.default_rng(1303)
print(rng.normal(scale=5, size=2))
rng2 = np.random.default_rng(1303)
print(rng2.normal(scale=5, size=2))

[ 4.09482632 -1.07485605]
[ 4.09482632 -1.07485605]


### functions for computing mean,variance and standard deviation of arrays.

In [9]:
rng = np.random.default_rng(3)
y = rng.standard_normal(10)
np.mean(y), y.mean()

(-0.1126795190952861, -0.1126795190952861)

In [10]:
# variance
np.var(y), y.var(), np.mean((y - y.mean())**2)

(2.7243406406465125, 2.7243406406465125, 2.7243406406465125)

In [11]:
print(np.sqrt(np.var(y)), np.std(y))
X = rng.standard_normal((10,3))
print(X)

X.mean(axis=0) # <- same thing as X.mean(0)

1.6505576756498128 1.6505576756498128
[[ 0.22578661 -0.35263079 -0.28128742]
 [-0.66804635 -1.05515055 -0.39080098]
 [ 0.48194539 -0.23855361  0.9577587 ]
 [-0.19980213  0.02425957  1.54582085]
 [ 0.54510552 -0.50522874 -0.18283897]
 [ 0.54052513  1.93508803 -0.26962033]
 [-0.24355868  1.0023136  -0.88645994]
 [-0.29172023  0.88253897  0.58035002]
 [ 0.0915167   0.67010435 -2.82816231]
 [ 1.02130682 -0.95964476 -1.66861984]]


array([ 0.15030588,  0.14030961, -0.34238602])

### Sequences and slice notation


In [12]:
seq1 = np.linspace(0,10,11)
print(seq1)

seq2 = np.arange(0,10)
print(seq2)

print("hello world"[3:6]) # same as "hello world"[slice(3:6)]

A = np.array(np.arange(16)).reshape((4,4))
print(A, "\n")
print(A[1,2], "\n")
print(A[[1,3]], "\n")
print(A[:,[0,2]],"\n")
print(A[[1,3], [0,2]],"\n")

[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]
[0 1 2 3 4 5 6 7 8 9]
lo 
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]] 

6 

[[ 4  5  6  7]
 [12 13 14 15]] 

[[ 0  2]
 [ 4  6]
 [ 8 10]
 [12 14]] 

[ 4 14] 



In [13]:
idx = np.ix_([1,3], [0,2,3])
print(A[idx],"\n")

print(A[1:4:2,0:3:2],"\n")

# Boolean indexing
keep_rows = np.zeros(A.shape[0], bool)
print(keep_rows,"\n")

keep_rows[[1,3]] = True
print(keep_rows,"\n")

np.all(keep_rows == np.array([0,1,0,1]))
A[np.array([0,1,0,1])]
print(A[keep_rows]) # retrievs only the second and fourth row of A, ie, the rows with Boolean = True

keep_cols = np.zeros(A.shape[1], bool)
keep_cols[[0,2,3]] = True
idx_bool = np.ix_(keep_rows,keep_cols)
A[idx_bool]



[[ 4  6  7]
 [12 14 15]] 

[[ 4  6]
 [12 14]] 

[False False False False] 

[False  True False  True] 

[[ 4  5  6  7]
 [12 13 14 15]]


array([[ 4,  6,  7],
       [12, 14, 15]])

### Loading Data

In [14]:
import pandas as pd
Auto = pd.read_csv("Auto.csv")
Auto


FileNotFoundError: [Errno 2] No such file or directory: 'Auto.csv'

In [159]:
Auto = pd.read_csv("Auto.data", na_values=["?"] ,delim_whitespace=True)

# Auto["horsepower"]
# np.unique(Auto["horsepower"])
print(Auto["horsepower"].sum())
print(Auto.shape) # rows,variables, or columns

Auto_new = Auto.dropna() # removing(dropping) missing values
print(Auto_new.shape)

Auto = Auto_new # overwrite the old data
print(Auto.columns)

Auto[:3]

40952.0
(397, 9)
(392, 9)
Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'year', 'origin', 'name'],
      dtype='object')


  Auto = pd.read_csv("Auto.data", na_values=["?"] ,delim_whitespace=True)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite


In [None]:
idx_80 = Auto["year"] > 80
# print(Auto[idx_80])

Auto[["mpg", "horsepower"]]
print(Auto.index)

Auto_re = Auto.set_index("name")
print(Auto_re)
# Auto_re.columns with the code above we removed "name" column.

rows = ["amc rebel sst", "ford torino"]
print(Auto_re.loc[rows])
# Alternative to retrieving indexes.
# print(Auto_re.iloc[[3,4]]) 
# print(Auto_re.iloc[:,[0,2,3]])
# print(Auto_re.iloc[[3,4], [0,2,3]])




In [None]:
Auto_re.loc["ford galaxie 500", ["mpg", "origin"]]

idx_80 = Auto_re["year"] > 80
Auto_re.loc[idx_80, ["weight", "origin"]]

Auto_re.loc[lambda df: df["year"] > 80, ["weight", "origin"]]


In [None]:
Auto_re.loc[lambda df: (df["year"] > 80) & (df["mpg"] > 30), ["weight", "origin"]]

In [None]:
Auto_re.loc[lambda df: (df["displacement"] < 300) & 
(df.index.str.contains("ford") | df.index.str.contains("datsun")),["weight", "origin"]]

### For loops

In [180]:
total = 0 
for value in [3,2,19]:
    total += value
print("Total is: {0}".format(total))


total = 0
for value in [2,3,19]:
    for weight in [3,2,1]:
        total += value * weight
print("Total is: {0}".format(total))


total = 0
for value, weight in zip([2,3,19], [0.2, 0.3, 0.5]):
    total += weight * value
print("Weighted average is: {0}".format(total))



Total is: 24
Total is: 144
Weighted average is: 10.8


### String formatting

In [184]:
rng = np.random.default_rng(1)
A = rng.standard_normal((127, 5))
M = rng.choice([0, np.nan], p=[0.8, 0.2], size= A.shape)
A += M
D = pd.DataFrame(A, columns= ["food", "bar", "pickle", "snack", "popcorn"])

print(D[:3])

       food       bar    pickle     snack   popcorn
0  0.345584  0.821618  0.330437 -1.303157       NaN
1       NaN -0.536953  0.581118  0.364572  0.294132
2       NaN  0.546713       NaN -0.162910 -0.482119


In [185]:
for col in D.columns:
    template = 'Column "{0}" has {1:.2%} missing values'
    print(template.format(col, np.isnan(D[col]).mean()))

Column "food" has 16.54% missing values
Column "bar" has 25.98% missing values
Column "pickle" has 29.13% missing values
Column "snack" has 21.26% missing values
Column "popcorn" has 22.83% missing values


### Additional Graphical and numerical summaries