In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Suppress every Python warning for the rest of the session. Note that this
# also hides deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")

In [None]:
# Load the marine fish dataset from the Kaggle input directory.
DATA_PATH = "/kaggle/input/marine-fish-dataset/Marine_Fish_Data.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows, rendered as white text on a red background.
preview_style = {'background-color': 'red', 'color': 'white'}
df.head().style.set_properties(**preview_style)

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Total number of missing cells across the whole frame.
df.isnull().sum().sum()

In [None]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

In [None]:
# Show the column names as a one-column table, white text on a red background.
column_overview = pd.DataFrame({'column name': df.columns})
column_overview.style.set_properties(**{'background-color': 'red', 'color': 'white'})

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) as computed by
# pandas' default describe() settings.
df.describe()

In [None]:
# Display the nine columns used throughout the analysis.
selected_columns = [
    'Species_Name',
    'Region',
    'Breeding_Season',
    'Fishing_Method',
    'Fish_Population',
    'Average_Size(cm)',
    'Overfishing_Risk',
    'Water_Temperature(C)',
    'Water_Pollution_Level',
]
pd.DataFrame(df[selected_columns])

In [None]:
# Split the columns into categorical and numerical groups by dtype.
categorical_columns = list(df.select_dtypes(include=['object', 'category']).columns)
numerical_columns = list(df.select_dtypes(include=['int64', 'float64']).columns)

for label, names in (("Kategorik Değişkenler:", categorical_columns),
                     ("Sayısal Değişkenler:", numerical_columns)):
    print(label, names)

In [None]:
# Count the missing values in every column.
missing_values = df.isnull().sum()
print(missing_values)

# Share of missing values per column, in percent.
missing_percentage = df.isnull().mean() * 100
print(missing_percentage)

# Impute missing fish sizes with the column mean. The result is assigned back
# to the column: calling fillna(inplace=True) on `df[...]` is chained
# assignment, which is deprecated and does not modify `df` when pandas
# Copy-on-Write is enabled.
df['Average_Size(cm)'] = df['Average_Size(cm)'].fillna(df['Average_Size(cm)'].mean())

# Drop every row that still has a missing value in any remaining column.
df = df.dropna()

# Confirm that no missing values are left after imputation and row removal.
print(df.isnull().sum())


In [None]:
# Histogram (with a KDE overlay) of the recorded water temperatures.
qualityRating = df['Water_Temperature(C)']

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(qualityRating, bins=20, kde=True, color='skyblue', ax=ax)
ax.set_xlabel('Water_Temperature(C)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Water_Temperature(C)')

plt.show()

In [None]:
# Pie chart of how often each species appears in the data.
supplier_sizes = df['Species_Name'].value_counts()
supplier_sizes.plot.pie(autopct='%1.1f%%', startangle=90, figsize=(10, 6), cmap='Set3')
plt.title('Species_Name')
plt.show()

In [None]:

# For each numeric column: report its skewness, then draw a histogram and a
# box plot side by side.
for col in numerical_columns:
    print(col)
    skewness = round(df[col].skew(), 2)
    print('Skew :', skewness)
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(15, 4))
    df[col].hist(grid=False, ax=hist_ax)
    hist_ax.set_ylabel('count')
    sns.boxplot(x=df[col], ax=box_ax)
    plt.show()

In [None]:
# One horizontal count plot per categorical column. The grid is sized from
# the number of columns (three panels per row) so the cell keeps working when
# there are more than nine categorical columns; a fixed 3x3 grid would raise
# on the tenth subplot.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(categorical_columns) // n_grid_cols))  # ceiling division

# 4 inches of height per row matches the original 20x12 figure for three rows.
plt.figure(figsize=(20, 4 * n_grid_rows))
for i, col in enumerate(categorical_columns):
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    sns.countplot(y=col, data=df, palette='viridis')
    plt.title(f'{col} Dağılımı')
plt.tight_layout()
plt.show()

In [None]:
# Scatter plot of water temperature against fish population.
# Plots from `df`, the frame loaded and cleaned earlier in this notebook; the
# previously referenced `df_cleaned` is not defined anywhere in the notebook.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Water_Temperature(C)', y='Fish_Population', data=df)
plt.title('Su Sıcaklığı vs Balık Popülasyonu')
plt.xlabel('Su Sıcaklığı (C)')
plt.ylabel('Balık Popülasyonu')
plt.grid(True)
plt.show()

In [None]:
# Overfishing risk broken down by species.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Species_Name', hue='Overfishing_Risk', palette='viridis', ax=ax)
ax.set_title('Türlere göre aşırı avlanma riski')
ax.set_xlabel('Species Name')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
ax.grid(True)
plt.show()

In [None]:
# Print the descriptive statistics of the frame as plain text.
print(df.describe())

In [None]:
from sklearn.cluster import KMeans

# Build the numeric feature matrix inside this cell so it does not depend on a
# variable from another cell. A 'Cluster' column left over from a previous run
# is excluded so that re-running the cell does not feed old labels back in.
numeric_df = df.select_dtypes(include=np.number).drop(columns=['Cluster'], errors='ignore')

# Train the K-Means model. random_state pins the centroid initialisation so
# the cluster labels are reproducible; n_init is set explicitly because its
# default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(numeric_df)

# Visualise the clustering result.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Water_Temperature(C)', y='Fish_Population', hue='Cluster', data=df, palette='viridis')
plt.title('K-Means Clustering')
plt.xlabel('Water Temperature (C)')
plt.ylabel('Fish Population')
plt.grid(True)
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Standardise the two features so neither dominates the distance computation
# purely because of its scale.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['Water_Temperature(C)', 'Fish_Population']])

# Re-train K-Means on the standardised features. random_state makes the
# cluster labels reproducible; n_init is set explicitly because its default
# differs between scikit-learn releases.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(scaled_features)

# Visualise the clustering result in the standardised feature space.
plt.figure(figsize=(10, 6))
sns.scatterplot(x=scaled_features[:, 0], y=scaled_features[:, 1], hue=df['Cluster'], palette='viridis')
plt.title('Standartlaştırılmış K-Means Clustering')
plt.xlabel('Standardized Water Temperature (C)')
plt.ylabel('Standardized Fish Population')
plt.grid(True)
plt.show()

In [None]:
def find_outliers(column, whisker=1.5):
    """Flag the values of ``column`` that lie outside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles of ``column``.

    Parameters
    ----------
    column : array-like or pandas.Series
        Numeric values to inspect.
    whisker : float, default 1.5
        Multiplier applied to the IQR when computing the fences. The default
        reproduces the conventional 1.5 * IQR rule.

    Returns
    -------
    Boolean mask shaped like ``column``; True marks a value strictly below
    the lower fence or strictly above the upper fence.
    """
    q1, q3 = np.percentile(column, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - whisker * iqr
    upper_bound = q3 + whisker * iqr

    return (column < lower_bound) | (column > upper_bound)

# Build an outlier mask for every numeric column and report how many values
# each mask flags.
numeric_columns = df.select_dtypes(include=np.number).columns
outliers_dict = {}
for col in numeric_columns:
    outliers_dict[col] = find_outliers(df[col])

for col in outliers_dict:
    outliers = outliers_dict[col]
    print(f"Variable '{col}': {outliers.sum()} outliers.")

In [None]:
# Pairwise correlations between the numerical columns, drawn as an annotated heatmap.
correlation_matrix = df.loc[:, numerical_columns].corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Heatmap")
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns that still hold string labels.
columns_cat = df.select_dtypes(include=['object']).columns.tolist()

label_encoder = LabelEncoder()

# Guard against an empty selection: once the columns have been encoded (for
# example when this cell is run a second time) there are no object columns
# left, and `mode().iloc[0]` on an empty frame raises IndexError.
if columns_cat:
    # Fill missing labels with each column's most frequent value.
    df[columns_cat] = df[columns_cat].fillna(df[columns_cat].mode().iloc[0])

    # Encode each column with its own encoder and keep the fitted encoders, so
    # the integer codes can be mapped back to the original labels with
    # label_encoders[column].inverse_transform(...). `label_encoder` ends up
    # bound to the encoder of the last column.
    label_encoders = {}
    for cat_col in columns_cat:
        label_encoder = LabelEncoder()
        df[cat_col] = label_encoder.fit_transform(df[cat_col])
        label_encoders[cat_col] = label_encoder

In [None]:
# Correlation heatmap over every column of the frame.
correlation_matrix = df.corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Heatmap")
plt.show()

In [None]:
# Box plot of the first numerical column against each categorical column.
for col in categorical_columns:
    target_column = numerical_columns[0]
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(data=df, x=col, y=target_column, ax=ax)
    ax.set_title(f'{col} ile {target_column} Arasındaki İlişki')
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Scatter plot of water temperature against fish population.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='Water_Temperature(C)', y='Fish_Population', ax=ax)
ax.set_title('Water Temperature vs Fish Population')
ax.set_xlabel('Water Temperature (C)')
ax.set_ylabel('Fish Population')
ax.grid(True)
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

# Fit fish population as a linear function of water temperature.
X = df[['Water_Temperature(C)']]
y = df['Fish_Population']

model = LinearRegression().fit(X, y)
y_pred = model.predict(X)

# Overlay the fitted line on the observed points.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X, y, color='blue', label='Gerçek Veri')
ax.plot(X, y_pred, color='red', label='Regresyon Eğrisi')
ax.set_title('Water Temperature vs Fish Population')
ax.set_xlabel('Water Temperature (C)')
ax.set_ylabel('Fish Population')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Log-transform helper
def log_transform(df, col):
    """Add a ``<name>_log`` column holding ``log(1 + x)`` for each given column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    col : str or iterable of str
        Name(s) of the column(s) to transform. A single string is treated as
        one column name rather than being iterated character by character.

    Returns
    -------
    None. After adding the columns, a summary of ``df`` is printed via
    ``df.info()``.
    """
    if isinstance(col, str):
        col = [col]
    for colname in col:
        # log1p(x) == log(1 + x): defined at x == 0 and accurate for tiny x,
        # where forming 1 + x first would round the value away.
        df[colname + '_log'] = np.log1p(df[colname])
    # info() prints by itself and returns None, so it must not be wrapped in
    # print() -- that would emit a stray "None" line.
    df.info()

# Apply the log transform to the population and temperature columns.
columns_to_log = ['Fish_Population', 'Water_Temperature(C)']
log_transform(df, columns_to_log)


In [None]:
# Distribution of each log-transformed column: fish population first, then
# water temperature. Each entry is (column, title, x-axis label).
log_plot_specs = [
    ('Fish_Population_log',
     'Fish Population (Log Transformed) Distribution',
     'Fish Population (Log)'),
    ('Water_Temperature(C)_log',
     'Water Temperature (Log Transformed) Distribution',
     'Water Temperature (Log)'),
]

for log_column, plot_title, x_label in log_plot_specs:
    plt.figure(figsize=(10, 6))
    sns.histplot(df[log_column], kde=True)
    plt.title(plot_title)
    plt.xlabel(x_label)
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()


In [None]:
# Leave the species and cluster columns out of the pairwise plots.
excluded_columns = ['Species_Name', 'Cluster']
df_dropped = df.drop(columns=excluded_columns)

# Pairwise relationships between all remaining columns.
sns.pairplot(df_dropped)
plt.show()