<a href="https://colab.research.google.com/github/yomyaykya/yomyay/blob/master/Fingerprint2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [3]:
# Distribution graphs (histogram/bar graph) of column data
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    nunique = df.nunique()
    df = df[[col for col in df if nunique[col] > 1 and nunique[col] < 50]] # For displaying purposes, pick columns that have between 1 and 50 unique values
    nRow, nCol = df.shape
    columnNames = list(df)
    nGraphRow = (nCol + nGraphPerRow - 1) / nGraphPerRow
    plt.figure(num = None, figsize = (6 * nGraphPerRow, 8 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
    for i in range(min(nCol, nGraphShown)):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        if (not np.issubdtype(type(columnDf.iloc[0]), np.number)):
            valueCounts = columnDf.value_counts()
            valueCounts.plot.bar()
        else:
            columnDf.hist()
        plt.ylabel('counts')
        plt.xticks(rotation = 90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
    plt.show()

In [4]:
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    filename = df.dataframeName
    df = df.dropna('columns') # drop columns with NaN
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum = 1)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()

In [5]:
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    df = df.select_dtypes(include =[np.number]) # keep only numerical columns
    # Remove rows and columns that would lead to df being singular
    df = df.dropna('columns')
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    columnNames = list(df)
    if len(columnNames) > 10: # reduce the number of columns for matrix inversion of kernel density plots
        columnNames = columnNames[:10]
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    for i, j in zip(*plt.np.triu_indices_from(ax, k = 1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()

In [7]:
nRowsRead = 1000 # specify 'None' if want to read whole file
# fingerprint.csv may have more rows in reality, but we are only loading/previewing the first 1000 rows
df1 = pd.read_csv('/content/fingerprint.csv', delimiter=',', nrows = nRowsRead)
df1.dataframeName = 'fingerprint.csv'
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 1000 rows and 271 columns


In [8]:
df1.head(5)

Unnamed: 0,Sample1,Sample2,Sample3,Sample4,Sample5,Sample6,Sample7,Sample8,Sample9,Sample10,Sample11,Sample12,Sample13,Sample14,Sample15,Sample16,Sample17,Sample18,Sample19,Sample20,Sample21,Sample22,Sample23,Sample24,Sample25,Sample26,Sample27,Sample28,Sample29,Sample30,Sample31,Sample32,Sample33,Sample34,Sample35,Sample36,Sample37,Sample38,Sample39,Sample40,...,Sample232,Sample233,Sample234,Sample235,Sample236,Sample237,Sample238,Sample239,Sample240,Sample241,Sample242,Sample243,Sample244,Sample245,Sample246,Sample247,Sample248,Sample249,Sample250,Sample251,Sample252,Sample253,Sample254,Sample255,Sample256,Sample257,Sample258,Sample259,Sample260,Sample261,Sample262,Sample263,Sample264,Sample265,Sample266,Sample267,Sample268,Sample269,Sample270,Sample271
0,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,...,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right,right
1,0.33364,0.62579,0,0,0.56233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.01711,0,0,0,0,0,0,0,0,0.0259,0,0,0,0,0.01411,0,0,0,0,1.28245,0.00271,...,0,0,0,0,4.94008,0,0,0.03616,0,0,0,0,0,0,1.28378,0,0,4.18694,0,0,0,0,0,0,0,0.22378,0,0,0,0,0,6.11234,0,0,0,0,0,0,0,0
2,0.33364,0.62579,0,0,0.56233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.01711,0,0,0,0,0,0,0,0,0.0259,0,0,0,0,0.01411,0,0,0,0,1.28245,0.00271,...,0,0,0,0,4.94008,0,0,0.03616,0,0,0,0,0,0,1.28378,0,0,4.18694,0,0,0,0,0,0,0,0.22378,0,0,0,0,0,6.11234,0,0,0,0,0,0,0,0
3,0.33364,0.62579,0,0,0.56233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.01711,0,0,0,0,0,0,0,0,0.0259,0,0,0,0,0.01411,0,0,0,0,1.28245,0.00271,...,0,0,0,0,4.93606,0,0,0.03616,0,0,0,0,0,0,1.28378,0,0,4.18694,0,0,0,0,0,0,0,0.22378,0,0,0,0,0,6.10627,0,0,0,0,0,0,0,0
4,0.33364,0.62579,0,0,0.56233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.01711,0,0,0,0,0,0,0,0,0.0259,0,0,0,0,0.01411,0,0,0,0,1.28245,0.00271,...,0,0,0,0,4.93606,0,0,0.03616,0,0,0,0,0,0,1.28378,0,0,4.18694,0,0,0,0,0,0,0,0.22378,0,0,0,0,0,6.10627,0,0,0,0,0,0,0,0


In [9]:
plotPerColumnDistribution(df1, 10, 5)

<Figure size 2400x512 with 0 Axes>