# Data Analis
# Introduction to Machine Learning


## Data Transformation

**Agregasi fitur**

In [3]:
import pandas as pd
# Synthetic data: three numeric features with five rows each
data = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [6, 7, 8, 9, 10],
    'C': [11, 12, 13, 14, 15],
})
# Aggregate features 'A' and 'B' into a single feature 'D' (their average),
# then drop the two source features so that only 'C' and 'D' remain.
data = (
    data
    .assign(D=lambda frame: (frame['A'] + frame['B']) / 2)
    .drop(columns=['A', 'B'])
)
data

Unnamed: 0,C,D
0,11,3.5
1,12,4.5
2,13,5.5
3,14,6.5
4,15,7.5


**Reduksi Data dengan Menggunakan PCA**

In [4]:
import pandas as pd
from sklearn.decomposition import PCA

# Synthetic data. B equals A + 5 and C equals A + 10, so the three columns
# are perfectly correlated with one another.
data = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [6, 7, 8, 9, 10],
    'C': [11, 12, 13, 14, 15],
})

# Use PCA to reduce the dataset from 3 features down to 2 components.
pca = PCA(n_components=2)
transformed_data = pca.fit_transform(data)

# Store the reduced dataset as a DataFrame, one column per component.
# In the displayed output the PC2 values are on the order of 1e-16
# (floating-point noise), which is consistent with the columns being
# perfectly correlated.
component_names = ['PC1', 'PC2']
data_reduced = pd.DataFrame(transformed_data, columns=component_names)
data_reduced

Unnamed: 0,PC1,PC2
0,-3.464102,-3.4399e-16
1,-1.732051,1.146633e-16
2,0.0,0.0
3,1.732051,-1.146633e-16
4,3.464102,-2.293267e-16


**Encoding Fitur**

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Synthetic text data: one short document per row in column 'A'
data = pd.DataFrame({
    'A': [
        'this is a sample sentence',
        'another example sentence',
        'a third text for illustration',
    ],
})
# Use TF-IDF to convert the text into vector form. The token pattern matches
# runs of one or more word characters, so one-letter words such as "a" are
# kept in the vocabulary (it appears as a column in the output).
vektorizer = TfidfVectorizer(stop_words=None, token_pattern=r'\b\w+\b')
encoded_data = vektorizer.fit_transform(data['A'])

# Turn the vectors into a DataFrame with one column per vocabulary term
data_encoded = pd.DataFrame(
    encoded_data.toarray(),
    columns=vektorizer.get_feature_names_out(),
)
data_encoded

Unnamed: 0,a,another,example,for,illustration,is,sample,sentence,text,third,this
0,0.373022,0.0,0.0,0.0,0.0,0.490479,0.490479,0.373022,0.0,0.0,0.490479
1,0.0,0.622766,0.622766,0.0,0.0,0.0,0.0,0.47363,0.0,0.0,0.0
2,0.355432,0.0,0.0,0.467351,0.467351,0.0,0.0,0.0,0.467351,0.467351,0.0


**Data Scaling**

In [15]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Synthetic data: two features on very different scales
data = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [100, 200, 300, 400, 500],
})

# Use StandardScaler to scale the data: fit it on the data, then apply the
# fitted transformation to that same data.
scaler = StandardScaler().fit(data)
scaled_data = scaler.transform(data)

# Turn the scaled values into a DataFrame with descriptive column names
data_scaled = pd.DataFrame(
    scaled_data,
    columns=['A_scaled', 'B_scaled'],
)
data_scaled

Unnamed: 0,A_scaled,B_scaled
0,-1.414214,-1.414214
1,-0.707107,-0.707107
2,0.0,0.0
3,0.707107,0.707107
4,1.414214,1.414214


**Data Normalization**

In [17]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Synthetic data: two features on very different scales
data = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [100, 200, 300, 400, 500],
})

# Use MinMaxScaler to normalize the data: fit it on the data, then apply the
# fitted transformation to that same data. In the displayed output both
# columns run from 0.0 to 1.0.
scaler = MinMaxScaler().fit(data)
normalized_data = scaler.transform(data)

# Turn the normalized values into a DataFrame with descriptive column names
data_normalized = pd.DataFrame(
    normalized_data,
    columns=['A_normalized', 'B_normalized'],
)
data_normalized

Unnamed: 0,A_normalized,B_normalized
0,0.0,0.0
1,0.25,0.25
2,0.5,0.5
3,0.75,0.75
4,1.0,1.0
