In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import os

In [None]:
# Load the dataset from CSV into `df`, the frame the rest of the notebook works on.
csv_path = 'veda_intermareal/normalizado.csv'
df = pd.read_csv(csv_path)
# Bare expression so the notebook renders the frame as the cell output.
df

In [None]:
# Descriptive statistics of `df`: computed once, saved to description.csv,
# then shown as the cell output.
summary = df.describe()
summary.to_csv("description.csv")
summary

In [None]:
# Shapiro-Wilk normality test for the 'M.O' column
from scipy.stats import shapiro

# Drop missing values first: shapiro() propagates NaN by default, which would
# make p NaN and send the comparison below to the "reject H0" branch for the
# wrong reason.
data = df['M.O'].dropna()

# Histogram for a visual check of the distribution
plt.hist(data)

stat, p = shapiro(data)
print('Statistics=%.3f, p=%.3f' % (stat, p))
# Interpret the result: H0 is "the sample was drawn from a normal distribution",
# so a p-value above alpha means normality cannot be rejected.
alpha = 0.05
if p > alpha:
    print('Sample looks Gaussian (fail to reject H0)')
else:
    print('Sample does not look Gaussian (reject H0)')

In [None]:
# Three common rescalings of the "Biomasa" column.
# Only the z-score is written to df["Biomasa_normal"]; the other two are kept
# in their own variables for comparison, so the results do not overwrite each
# other in the same column.
biomasa = df["Biomasa"]

# Simple feature scaling: divide by the maximum
biomasa_max_scaled = biomasa / biomasa.max()
# Min-max scaling: shift by the minimum, divide by the observed range
biomasa_min_max = (biomasa - biomasa.min()) / (biomasa.max() - biomasa.min())
# Z-score: subtract the mean, divide by the standard deviation
df["Biomasa_normal"] = (biomasa - biomasa.mean()) / biomasa.std()

In [None]:
# Normalization by log (or square root): here the natural log is applied
from math import sqrt  # imported but not used in this cell

# NOTE: each column is overwritten in place, so re-running this cell applies
# the logarithm a second time.
for column in ("Biomasa", "Abundancia", "Equidad de Pielou"):
    df[column] = np.log(df[column])


# Save the transformed frame and show it as the cell output
df.to_csv("normalizado.csv")
df

In [None]:
# Scratch cell: square root of `a`.
# NOTE(review): `a` is not assigned in any cell visible in this notebook, so
# unless it is defined elsewhere this raises NameError on a fresh kernel --
# define `a` first or remove the cell.
from math import sqrt
sqrt(a)

In [None]:
# Shapiro-Wilk normality test for the 'NE' column
from scipy.stats import shapiro

# Drop missing values first: shapiro() propagates NaN by default, which would
# make p NaN and send the comparison below to the "reject H0" branch for the
# wrong reason.
data = df['NE'].dropna()

# Histogram for a visual check of the distribution
plt.hist(data)

stat, p = shapiro(data)
print('Statistics=%.3f, p=%.3f' % (stat, p))
# Interpret the result: H0 is "the sample was drawn from a normal distribution",
# so a p-value above alpha means normality cannot be rejected.
alpha = 0.05
if p > alpha:
    print('Sample looks Gaussian (fail to reject H0)')
else:
    print('Sample does not look Gaussian (reject H0)')

In [None]:
from scipy.stats import pearsonr
import numpy as np
import math

# Pairwise Pearson correlation coefficients between the columns of df
rho = df.corr()
# Matching p-values: pearsonr(x, y)[1] is the p-value for each pair. With a
# callable method pandas fills the diagonal with 1, so the identity matrix is
# subtracted to leave 0 there.
pval = df.corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*rho.shape)
# '*' where p <= 0.05, empty string otherwise. Column-wise Series.map is used
# instead of DataFrame.applymap, which is deprecated in recent pandas.
p = pval.apply(lambda col: col.map(lambda x: '*' if x <= 0.05 else ''))
# Coefficients rounded to 2 decimals, as text, with the significance star appended
corelation = rho.round(2).astype(str) + p
# Stack the annotated table on top of the raw p-values. DataFrame.append was
# removed in pandas 2.0; pd.concat produces the same row-wise result.
new = pd.concat([corelation, pval])
new

In [None]:
from scipy.stats import pearsonr
import numpy as np
import pandas as pd
from datetime import datetime
import scipy.stats  as stats


# Pairwise Pearson correlation coefficients between the columns of df
rho = df.corr()
# Matching p-values: pearsonr(x, y)[1] is the p-value for each pair. With a
# callable method pandas fills the diagonal with 1, so the identity matrix is
# subtracted to leave 0 there.
pval = df.corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*rho.shape)
# '*' where p <= 0.05, empty string otherwise. Column-wise Series.map is used
# instead of DataFrame.applymap, which is deprecated in recent pandas.
p = pval.apply(lambda col: col.map(lambda x: '*' if x <= 0.05 else ''))
# Coefficients rounded to 2 decimals, as text, with the significance star appended
corelation = rho.round(2).astype(str) + p
df_columns = corelation.columns.values  # not used further in this cell

# Print every cell of the table next to its p-value and its (row, column)
# position, walking the matrix row by row.
corr_cells = corelation.values
pval_cells = pval.values
for i, j in np.ndindex(corr_cells.shape):
    print(corr_cells[i, j], pval_cells[i, j], i, j)

In [None]:
# Pearson correlation between the Biomasa and NE columns; the result is shown
# as the cell output.
stats.pearsonr(df["Biomasa"], df["NE"])

In [None]:
from scipy.stats import pearsonr
import numpy as np

# Pairwise Pearson correlation coefficients between the columns of df
rho = df.corr()
# Matching p-values: pearsonr(x, y)[1] is the p-value for each pair. With a
# callable method pandas fills the diagonal with 1, so the identity matrix is
# subtracted to leave 0 there.
pval = df.corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*rho.shape)
# '*' where p <= 0.05, empty string otherwise. Column-wise Series.map is used
# instead of DataFrame.applymap, which is deprecated in recent pandas.
p = pval.apply(lambda col: col.map(lambda x: '*' if x <= 0.05 else ''))
# Coefficients rounded to 2 decimals, as text, with the significance star appended
corelation = rho.round(2).astype(str) + p
frames = [corelation, pval]  # not used further in this cell
# Save the annotated table and show it as the cell output
corelation.to_csv("correlation.csv")
corelation


In [None]:
# Important correlations
# Drop the last row and column of rho (the original comment calls it the target).
# NOTE(review): confirm the last column really is the one to exclude for this dataset.
attrs = rho.iloc[:-1, :-1]
# Keep only coefficients whose magnitude exceeds the threshold ...
threshold = 0.5
strong = attrs[attrs.abs() > threshold]
# ... and blank out entries equal to exactly 1.0 (the self-correlations)
strong_off_diagonal = strong[attrs != 1.0]
# Flatten to a dict keyed by a pair of labels; blanked entries are dropped
important_corrs = strong_off_diagonal.unstack().dropna().to_dict()
# Each pair appears twice (a-b and b-a); sorting the labels inside the key and
# collecting into a set leaves one entry per pair
unique_pairs = {(tuple(sorted(pair)), value) for pair, value in important_corrs.items()}
unique_important_corrs = pd.DataFrame(list(unique_pairs),
                                      columns=['attribute pair', 'correlation'])
# Order by absolute value, strongest first
descending = unique_important_corrs['correlation'].abs().argsort()[::-1]
unique_important_corrs = unique_important_corrs.iloc[descending]
unique_important_corrs.to_csv("importantcorrelation.csv")
print(unique_important_corrs)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def corrdot(*args, **kwargs):
    """Draw the Pearson correlation of two series as a sized, coloured dot.

    ``args[0]`` and ``args[1]`` are the two series to correlate; the drawing
    goes onto the current matplotlib axes. The axes frame is hidden, a dot
    whose size and colour follow the coefficient is placed at the centre, and
    the coefficient is written over it in a font that grows with its
    magnitude. ``kwargs`` are accepted but not used.
    """
    first, second = args[0], args[1]
    r = first.corr(second, 'pearson')
    # Two decimals with the leading zero dropped, e.g. 0.45 -> .45
    label = f"{r:2.2f}".replace("0.", ".")

    axes = plt.gca()
    axes.set_axis_off()
    # Dot at the centre of the panel: size scales with |r|, colour with r
    axes.scatter([.5], [.5], abs(r) * 10000, [r], alpha=0.6, cmap="coolwarm",
                 vmin=-1, vmax=1, transform=axes.transAxes)
    # Coefficient text on top of the dot
    axes.annotate(label, [.5, .5,], xycoords="axes fraction",
                  ha='center', va='center', fontsize=abs(r) * 40 + 5)

# Pair grid over all columns of df: regression fits below the diagonal,
# distributions on it, correlation dots (corrdot) above it.
sns.set(style='white', font_scale=1.6)

lower_options = dict(lowess=True, ci=False, line_kws={'color': 'black'})
diag_options = dict(kde_kws={'color': 'black'})

g = sns.PairGrid(df, aspect=1.4, diag_sharey=False)
g.map_lower(sns.regplot, **lower_options)
# NOTE(review): sns.distplot is deprecated in recent seaborn releases -- confirm
# the installed version still provides it.
g.map_diag(sns.distplot, **diag_options)
g.map_upper(corrdot)
g.savefig("output.png")

# Alternative export, left disabled:
#plt.savefig("figure.png", format='png', dpi=350, transparent=True)

In [None]:
# R example: join two data frames with merge() when the key column has a
# different name in each frame.
# NOTE(review): this cell is R while the other cells are Python; it needs an R
# kernel to run -- confirm it belongs in this notebook.

# One row per author. The frame must be called `authors`, because both merge()
# calls below refer to it by that name.
authors <- data.frame(
    surname = c("Tukey", "Venables", "Tierney", "Ripley", "McNeil"),
    nationality = c("US", "Australia", "US", "UK", "Australia"),
    retired = c("yes", rep("no", 4)))
# One row per book; `name` holds the author's surname (Ripley appears twice)
books <- data.frame(
    name = c("Tukey", "Venables", "Tierney", "Ripley", "Ripley", "McNeil"),
    title = c("Exploratory Data Analysis",
              "Modern Applied Statistics ...",
              "LISP-STAT",
              "Spatial Statistics", "Stochastic Simulation",
               "Interactive Data Analysis"),
    other.author = c(NA, "Ripley", NA, NA, NA, NA))

# Join on authors$surname == books$name, in both argument orders
merge(authors, books, by.x="surname", by.y="name")
merge(books, authors, by.x="name", by.y="surname")