# Python for (open) Neuroscience

_Lecture 3.0_ - Introduction to Statistics and Machine Learning in Python

Luigi Petrucco

Jean-Charles Mariani

## Classical statistics using Python

Disclaimer: I know very little about statistics!

No, Python will not tell you what tests to use...

...but the `scipy` module provides functions for pretty much any kind of classical statistics you might want to compute!

### Descriptive statistics

#### Mean, median, mode

In [None]:
import numpy as np
import scipy.stats as stats

# 1000 samples drawn from a normal distribution with mean 0 and std 1
data = np.random.normal(0, 1, 1000)

# `stats.mode` returns a (mode, count) result. Depending on the SciPy version
# the `mode` field is either a scalar or a length-1 array for 1-D input, so it
# is normalised with `np.atleast_1d` rather than indexed with a hard-coded
# `[0][0]` (which raises on a scalar result).
# NOTE: for continuous data nearly every value is unique, so the mode is not
# a very informative statistic here; it is shown for completeness.
mode_value = np.atleast_1d(stats.mode(data).mode)[0]

print('Mean: {}'.format(np.mean(data)))
print('Median: {}'.format(np.median(data)))
print('Mode: {}'.format(mode_value))

#### Standard deviation, variance

In [None]:
# Spread of the sample, computed with NumPy's default settings for both calls
print(f'Standard deviation: {np.std(data)}')
print(f'Variance: {np.var(data)}')

#### Skewness, kurtosis (and other moments)

In [None]:
# Higher-order shape statistics of the sample (SciPy defaults for both calls)
print(f'Skewness: {stats.skew(data)}')
print(f'Kurtosis: {stats.kurtosis(data)}')

#### Quantiles

In [None]:
# Empirical quantiles at mquantiles' default probabilities
print(f'Quantiles: {stats.mstats.mquantiles(data)}')

## Statistical tests

### One-sample tests

In [None]:
# One-sample t-test
# Tests the mean of `data` against the hypothesised population mean 0
# (second argument); the result (statistic and p-value) is the cell output.
stats.ttest_1samp(data, 0)


In [None]:
# One-sample Wilcoxon signed-rank test
# Non-parametric counterpart of the one-sample t-test: with a single argument
# the values of `data` themselves are tested for symmetry around zero.
stats.wilcoxon(data)

### Two-sample tests

#### Independent samples

# Two-sample t-test

In [None]:
# Second sample, drawn independently with the same parameters (mean 0, std 1)
data2 = np.random.normal(0, 1, 1000)
# Independent two-sample t-test comparing the means of `data` and `data2`
stats.ttest_ind(data, data2)

# Two-sample Wilcoxon rank-sum test

In [None]:
# Two-sample Wilcoxon rank-sum test
# Non-parametric comparison of two independent samples.
stats.ranksums(data, data2)

#### Paired samples

# Paired t-test

In [None]:
# Paired (related-samples) t-test: data[i] is compared with data2[i], so both
# arrays must have the same length. The two arrays used here were generated
# independently, so the pairing only serves to illustrate the call.
stats.ttest_rel(data, data2)

# Paired Wilcoxon signed-rank test

In [None]:
# Paired Wilcoxon signed-rank test: with two arguments the test is applied to
# the element-wise differences between `data` and `data2`.
stats.wilcoxon(data, data2)

# One-way ANOVA

In [None]:
# Three groups of 1000 samples: two centred on 0 and one centred on -1,
# all with unit standard deviation.
data = np.random.normal(loc=0, scale=1, size=1000)
data2 = np.random.normal(loc=0, scale=1, size=1000)
data3 = np.random.normal(loc=-1, scale=1, size=1000)
# One-way ANOVA across the three groups
stats.f_oneway(data, data2, data3)

# Kolmogorov-Smirnov test

In [None]:
# One normally distributed and one uniformly distributed sample
data_norm = np.random.normal(loc=0, scale=1, size=1000)
data_unif = np.random.uniform(low=0, high=1, size=1000)

# Kolmogorov-Smirnov test of the normal sample against the 'norm' distribution
# (only `data_norm` is tested in this cell)
stats.kstest(data_norm, 'norm')

### Correlation

In [None]:
# Two independently generated standard-normal samples of equal length
data = np.random.normal(loc=0, scale=1, size=1000)
data2 = np.random.normal(loc=0, scale=1, size=1000)
# Pearson correlation between the two samples
stats.pearsonr(data, data2)

## Normality tests

In [None]:
# Test normality of data
# A fresh standard-normal sample is generated and passed to scipy's normaltest
data = np.random.normal(loc=0, scale=1, size=1000)
stats.normaltest(data)

# Curve fitting

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

# Define the function to fit
def func(x, a, b, c):
    """Exponential model ``a * exp(-b * x) + c``.

    Parameters
    ----------
    x : scalar or array
        Point(s) at which the model is evaluated.
    a, b, c : scalar
        Amplitude, rate and additive offset of the exponential.

    Returns
    -------
    Model value(s) at ``x``.
    """
    exponential = np.exp(-b * x)
    return a * exponential + c

# Generate data
# 50 evenly spaced x values in [0, 4]; y follows the model with
# a=2.5, b=1.3, c=0.5 plus Gaussian noise scaled by 0.2.
x = np.linspace(0, 4, num=50)
ydata = func(x, a=2.5, b=1.3, c=0.5) + 0.2 * np.random.normal(size=x.size)


In [None]:
# plot the data
# Small (3x3 inch) figure with the noisy samples as a scatter plot
plt.figure(figsize=(3, 3))
plt.scatter(x, ydata, label='data')

In [None]:
# Fit the data
# popt: fitted values for the parameters of `func` (a, b, c)
# pcov: estimated covariance matrix of those parameters
popt, pcov = curve_fit(func, x, ydata)


In [None]:
# Overlay the fitted curve (black line) on the noisy samples
plt.figure(figsize=(3, 3))
plt.scatter(x, ydata, label='data')
plt.plot(x, func(x, *popt), c="k", label='fit')
# The `label` arguments above are only shown once a legend is drawn
plt.legend()

In [None]:
def func(x, a, b, c):
    """Evaluate ``a * exp(-b * x) + c`` at ``x``.

    Same model that is passed to ``curve_fit`` in this notebook; repeated here
    as a reminder of the parameter order (a, b, c).
    """
    decay_term = np.exp(-b * x)
    return a * decay_term + c

In [None]:
# Display the covariance matrix returned by curve_fit; rows and columns follow
# the parameter order of `func` (a, b, c).
pcov

(Practicals 3.2.0)

## Advanced statistics using statsmodels

The `statsmodels` module provides a more complete set of statistical tools, including:
- Linear models
- Generalized linear models
- Multivariate statistics
- ...

### Linear models

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Load the 'Guerry' dataset from the 'HistData' package through statsmodels'
# R-datasets interface (NOTE(review): presumably needs network access on first
# use - confirm). `.data` is the table itself.
data = sm.datasets.get_rdataset('Guerry', 'HistData').data
# Optional restriction to the columns used below, dropping incomplete rows:
# data = data[['Lottery', 'Literacy', 'Wealth', 'Region']].dropna()
# Show the first rows as the cell output
data.head()

In [None]:
# Ordinary least squares via the formula interface: `Lottery` is the response,
# `Literacy`, `Wealth` and `Region` are the predictors
# (NOTE(review): `Region` is presumably a categorical column - confirm).
model = smf.ols('Lottery ~ Literacy + Wealth + Region', data=data)
results = model.fit()
# Full regression table (coefficients, standard errors, fit statistics)
print(results.summary())

In [None]:
import seaborn as sns

# Scatter plot of Lottery against Literacy with a fitted regression line
sns.regplot(x='Literacy', y='Lottery', data=data)

In [None]:
# Grid of pairwise plots restricted to the three listed columns
sns.pairplot(data, vars=['Lottery', 'Literacy', 'Wealth'])

### Generalized linear models

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Reload the Guerry dataset and keep only the four columns of interest,
# dropping rows with missing values.
# NOTE(review): this cell only prepares the data; no generalized linear model
# is fitted in the visible notebook.
data = sm.datasets.get_rdataset('Guerry', 'HistData').data
data = data[['Lottery', 'Literacy', 'Wealth', 'Region']].dropna()



(Practicals 3.2.1)

# Machine learning using scikit-learn

## Data preprocessing

In [None]:
from sklearn import preprocessing
# Toy training matrix: 3 rows, 3 columns
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])

# Fit a StandardScaler on the training matrix; the fitted scaler object is
# the cell output.
scaler = preprocessing.StandardScaler().fit(X_train)
scaler

In [None]:
# Apply the fitted scaler to the same matrix and display the result
X_scaled = scaler.transform(X_train)
X_scaled

Other scalers are available, e.g. for min-max normalization (`MinMaxScaler`), etc.

In [None]:
# Histogram equalization
# 1000 standard-normal samples are passed through a QuantileTransformer that
# uses as many quantiles as there are samples.
data = np.random.normal(0, 1, 1000)
quantile_trasformer = preprocessing.QuantileTransformer(n_quantiles=1000)
# The transformer expects a 2-D input, hence the reshape to a single column;
# `trasf` therefore has one column as well.
trasf = quantile_trasformer.fit_transform(data.reshape(-1, 1))

In [None]:
# Shape of the transformed array (the transformer was fed a single column)
trasf.shape

In [None]:
# Original values (x axis) against their quantile-transformed values (y axis)
plt.figure()
plt.scatter(data, trasf)

# Dimensionality reduction

### Principal component analysis

In [None]:
from sklearn.decomposition import PCA

# A simple two-dimensional dataset:
from sklearn import datasets
# NOTE(review): `iris` is loaded here but not used in the visible cells.
iris = datasets.load_iris()

# Six points in 2-D, used for the PCA example below
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])

# Scatter plot of the raw points
plt.figure()
plt.scatter(X[:, 0], X[:, 1])

In [None]:
# Fit a 2-component PCA on X
pca = PCA(n_components=2)
pca.fit(X)
# Fraction of the total variance captured by each component
print(pca.explained_variance_ratio_)

# Singular values associated with each component
print(pca.singular_values_)

In [None]:
# Raw points with the first principal axis drawn in red, as a segment from
# the origin to the coordinates of pca.components_[0]
plt.figure(figsize=(3, 3)  )
plt.scatter(X[:, 0], X[:, 1])
plt.plot([0, pca.components_[0, 0]], [0, pca.components_[0, 1]], 'r')

In [None]:
# Project X onto the fitted principal components
transformed = pca.transform(X)

# Scatter plot of the data in principal-component coordinates
plt.figure(figsize=(3, 3))
plt.scatter(transformed[:, 0], transformed[:, 1])

# Splitting data into training and test sets

## Scikit-learn offers a function to split data into training and test sets:

In [None]:
from sklearn.datasets import make_blobs

from sklearn.model_selection import train_test_split
# Synthetic dataset: 50000 samples with 5 features, grouped around 5 centers;
# the fixed random_state makes it reproducible.
X, y = make_blobs(n_samples=50000, centers=5, n_features=5, random_state=0)
# Split into train and test parts; `stratify=y` asks for the label
# proportions to be preserved in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, stratify=y, random_state=50)

In [None]:
# First two features of the training set, coloured by training label
plt.figure(figsize=(3, 3))
plt.scatter(X_train[:, 0], X_train[:, 1], c=Y_train)

There are also tools to loop over multiple splits of the data:



In [None]:
from sklearn.model_selection import KFold

# Eight samples with two features each
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4],
              [0, 2], [1, 2], [5, 2], [0, 1]])
# 4-fold splitter
kf = KFold(n_splits=4)
# Number of splits; the return value is not used or displayed here
kf.get_n_splits(X)
# print(kf)

# Each iteration yields the row indices of the training and test parts
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

# Data clustering

In [None]:
import numpy as np

# Create two clusters of data and merge them into one dataset:
# 100 two-dimensional points around 0 and 100 around 5, unit std in both.
data1 = np.random.normal(loc=0, scale=1, size=(100, 2))
data2 = np.random.normal(loc=5, scale=1, size=(100, 2))

# Stack the two clusters row-wise into a single (200, 2) array
data = np.vstack([data1, data2])

In [None]:
# Scatter plot of the merged dataset before clustering.
# The colour is passed via `color`: `cmap` only takes effect when `c` holds
# values to be colour-mapped, so `cmap="gray"` on its own is ignored.
plt.figure()
plt.scatter(data[:, 0], data[:, 1], color="gray")

In [None]:
from sklearn.cluster import KMeans

# 2-component PCA projection of the dataset
reduced_data = PCA(n_components=2).fit_transform(data)
# K-means with 2 clusters, k-means++ initialisation and 10 restarts
kmeans = KMeans(init="k-means++", n_clusters=2, n_init=10)
# Fit on the original (non-projected) data and get one cluster label per row
labels_predicted = kmeans.fit_predict(data)

In [None]:
plt.figure(figsize=(3, 3))

# Plot the PCA projection of the clustered dataset, coloured by the predicted
# k-means label. `reduced_data` has one row per label; the previously used
# `trasf` is the single-column quantile-transformed sample from the
# preprocessing section, so it has no second column and does not match
# `labels_predicted`.
plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=labels_predicted)

(Practicals 3.2.2)