In [13]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

## Read Data

In [14]:
def _split_hits(cell):
    """Parse one serialized hit list from the CSV.

    Strips any leading/trailing '[' and ']' characters and splits the rest
    on ';'. The pieces are returned as strings exactly as split; no numeric
    conversion or whitespace trimming is applied.
    """
    return cell.strip("[]").split(";")


# Location of the raw data file, relative to this notebook.
path = "../../../data/"
file_name = path + "Carbon-Data-ALPIDE-CHIP-Longitudinal/151218_04237.csv"

# 'column' and 'row' share the same list encoding, so both use one parser.
conv = dict.fromkeys(("column", "row"), _split_hits)
df = pd.read_csv(file_name, converters=conv)

In [15]:
# Preview the first rows to check that the file parsed into the expected columns.
df.head()

Unnamed: 0,ru_id,frame_id,stave_id,chip_id,abs_time,bunch_counter,column,row
0,1,1.0,0,5,3949260000.0,54,"[1, 1, 0, 0, 1, 1, 2, 3, 4, 4, 5, 5, 7, 7, 7, ...","[48, 239, 263, 297, 305, 370, 332, 433, 250, 3..."
1,1,2.0,0,5,3949380000.0,154,"[1, 1, 0, 0, 1, 1, 2, 3, 4, 4, 5, 5, 7, 7, 7, ...","[48, 239, 263, 297, 305, 370, 332, 433, 250, 3..."
2,1,3.0,0,5,3949500000.0,64,"[1, 1, 0, 0, 1, 1, 2, 3, 4, 4, 5, 5, 7, 7, 7, ...","[48, 239, 263, 297, 305, 370, 332, 433, 250, 3..."
3,1,6.0,0,5,3949860000.0,106,[829],[478]
4,1,14.0,0,5,3950820000.0,11,"[957, 957, 958, 958]","[421, 422, 421, 422]"


## Dataframe describe

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,ru_id,frame_id,stave_id,chip_id,abs_time,bunch_counter
count,4960.0,4960.0,4960.0,4960.0,4960.0,4960.0
mean,1.0,56205220.0,0.0,5.0,2058590000.0,116.111492
std,0.0,37451090.0,0.0,0.0,1207692000.0,69.589795
min,1.0,1.0,0.0,5.0,90229.0,0.0
25%,1.0,23039500.0,0.0,5.0,1161115000.0,57.0
50%,1.0,55199600.0,0.0,5.0,2078060000.0,113.0
75%,1.0,82200500.0,0.0,5.0,3117590000.0,174.0
max,1.0,129487000.0,0.0,5.0,4294940000.0,255.0


## Check which values appear in each column
* Look at the values.
* Make a brief hypothesis.

In [17]:
# List the distinct values stored in 'ru_id'.
df['ru_id'].unique()

array([1], dtype=int64)

* The 'ru_id' column contains only the value 1.
* Because it is constant, this column does not look important.

In [18]:
# Inspect 'frame_id': its distinct values, how many there are, and whether any are missing.
frame_ids = df["frame_id"]
print(frame_ids.unique())
print(frame_ids.nunique())
print(frame_ids.isna().any())

[1.00000e+00 2.00000e+00 3.00000e+00 ... 1.29484e+08 1.29486e+08
 1.29487e+08]
1128
False


* The 'frame_id' column has many distinct values.
* The dataset has 4960 rows, but only 1128 unique ``frame_id`` values.
* There are no null values.
* This means that multiple rows share the same ``frame_id``.

In [19]:
# List the distinct values stored in 'stave_id'.
df['stave_id'].unique()

array([0], dtype=int64)

* The 'stave_id' column contains only the value 0.
* Because it is constant, this column does not look important.

In [20]:
# List the distinct values stored in 'chip_id'.
df['chip_id'].unique()

array([5], dtype=int64)

* The 'chip_id' column contains only the value 5.
* Because it is constant, this column does not look important.

In [21]:
# Inspect 'abs_time': its distinct values, how many there are, and whether any are missing.
abs_times = df["abs_time"]
print(abs_times.unique())
print(abs_times.nunique())
print(abs_times.isna().any())

[3.94926e+09 3.94938e+09 3.94950e+09 ... 2.65462e+09 2.65474e+09
 2.65498e+09]
4784
False


* This column has many distinct values (4784 unique values across 4960 rows).
* From the column name, we can expect that this value indicates a time.
* There are no null values; since there are fewer unique values than rows, some rows share the same ``abs_time``.

In [22]:
# Inspect 'bunch_counter': its distinct values, how many there are, and its range.
bunch_counter = df["bunch_counter"]
print(bunch_counter.unique())
print(bunch_counter.nunique())
# Use the vectorized Series reductions rather than the Python builtins,
# which would iterate over the Series one element at a time.
print(bunch_counter.min())
print(bunch_counter.max())

[ 54 154  64 106  11  82 100   9 222 183 244  63 110  24 237 190 242 137
 203 151   8 217  60 122 178 235 239  92 135 196 149   6 158   1  53 128
 189 194  69 105 162 114  37  80  42  94 155 102 116 159 125 238 181  91
 191 134 243 247  57 218 227  84  98  41 140  68 163  25 229 147 208 226
 169 231  88 131 144 165  13  74  79 187  49  20 171 133 185  38  47 103
  22  30  39 101  44  19 166  71  33 146  93  55 177  87 236 179  50 202
  59  73 115 167  86   0 108 170 113  83 206  77 124  43  52 212 112 182
  90  65  26 252 157  76  75 193   3   7 168  35 143 204 161 132 141  97
 224 176  95  99  45 201 210 219  15 129 233  31 117 121 225 175 107  21
 172 111  72 186 251 109  66  61 126  36 153 120 199 104 255 230 214 232
 184 145 150  16  34  23  89 195   4 160 174  96 180  12 130  32  51   5
  28 136 240 192 119 142  27  40 148 241 246 164 139  85 234 213 127  29
  14  56 156  18 118  70 188 173   2  58  67 216  17  10  78 123 138  48
 152 248 200 209  62 245 215  81 223 211 197 228  4

* This column has many distinct values.
* The dataset has 4960 rows, but only 256 unique ``bunch_counter`` values.
* This means that multiple rows share the same ``bunch_counter``.
* The minimum ``bunch_counter`` is 0, and the maximum ``bunch_counter`` is 255.

#### column and row data
* The 'column' and 'row' fields each hold many values per entry. Together they look like a photo taken at that moment (column: x axis, row: y axis).

### ○ data exploration conclusion
* I decided to drop 'ru_id', 'stave_id', 'chip_id' columns.