In [39]:
import pandas as pd
import numpy as np
import scipy.stats as sps

In [57]:
# Helper: two-sided Student-t confidence interval for a sample mean
def mean_confidence_interval(data, confidence=0.95):
    """Return the sample mean together with its confidence-interval bounds.

    The interval is ``mean ± t * sem``, where ``sem`` is the standard error
    reported by ``scipy.stats.sem`` and ``t`` is the Student-t quantile at
    ``(1 + confidence) / 2`` with ``len(data) - 1`` degrees of freedom.

    Parameters
    ----------
    data : sequence of numbers
        The observations; converted to a float NumPy array.
    confidence : float, default 0.95
        Two-sided confidence level.

    Returns
    -------
    tuple
        ``(mean, lower_bound, upper_bound)``.
    """
    sample = np.array(data) * 1.0  # multiplying by 1.0 promotes ints to float
    dof = len(sample) - 1
    center = np.mean(sample)
    std_error = sps.sem(sample)
    t_critical = sps.t.ppf((1 + confidence) / 2., dof)
    half_width = std_error * t_critical
    return center, center - half_width, center + half_width

In [69]:
# Build an extended descriptive-statistics table for every column of
# "cleaned.csv" and write it to "data-description.csv".

CONFIDENCE_LEVEL = 0.95   # confidence level for the mean's interval
TRIM_PROPORTION = 0.05    # fraction cut from EACH tail for the trimmed mean

data = pd.read_csv("cleaned.csv")

# Drop the first and last columns (selected by position). Done without
# `inplace=True` so the step is a plain reassignment.
# NOTE(review): presumably an exported index column and a trailing
# non-feature column -- confirm against the CSV.
data = data.drop(columns=[data.columns[0], data.columns[-1]])

n_rows, n_features = data.shape

# basic: mean, std, min, quartiles, max ('count' is dropped)
basic = data.describe().drop(['count'])

# range
data_range = (basic.loc['max'] - basic.loc['min']).to_frame('range').transpose()

# interquartile range
iqr = (basic.loc['75%'] - basic.loc['25%']).to_frame('iqr').transpose()

# variance
variance = data.var().to_frame('variance').transpose()

# skewness
skewness = data.skew().to_frame('skewness').transpose()

# kurtosis
kurtosis = data.kurtosis().to_frame('kurtosis').transpose()

# confidence interval of the mean, one (lower, upper) pair per column
intervals = [
    mean_confidence_interval(data.iloc[:, i], confidence=CONFIDENCE_LEVEL)
    for i in range(n_features)
]
lower_confidence_interval = pd.DataFrame(
    [[lower for _, lower, _ in intervals]],
    index=['lower 95% confidence interval'],  # stray comma in the old label removed
    columns=data.columns,
)
upper_confidence_interval = pd.DataFrame(
    [[upper for _, _, upper in intervals]],
    index=['upper 95% confidence interval'],
    columns=data.columns,
)

# 5% trimmed mean. `scipy.stats.tmean` without `limits` is just the ordinary
# mean; `trim_mean` actually discards TRIM_PROPORTION of the sorted
# observations from each end before averaging.
tmean = pd.DataFrame(
    [[sps.trim_mean(data.iloc[:, i], TRIM_PROPORTION) for i in range(n_features)]],
    index=['5% tmean'],
    columns=data.columns,
)

# Stack all rows in one go. `DataFrame.append` was removed in pandas 2.0;
# a single `pd.concat` gives the same row order.
description = pd.concat([
    basic,
    data_range,
    iqr,
    variance,
    skewness,
    kurtosis,
    lower_confidence_interval,
    upper_confidence_interval,
    tmean,
])

description.to_csv('data-description.csv')