In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
import openai
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import streamlit as st
# Set the OpenAI key from the environment instead of embedding it in source.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked; export a fresh one as OPENAI_API_KEY before launching.
import os

# A missing variable leaves the key as None; the OpenAI calls then fail and
# the app reports the failure through its existing st.error handlers.
openai.api_key = os.environ.get("OPENAI_API_KEY")



# Page setup: browser-tab title, icon and a wide layout
st.set_page_config(
    layout="wide",
    page_icon="📊",
    page_title="Data Analysis and ML",
)

# Centered HTML heading rendered as the app title
title_html = "<h1 style='text-align: center;'>Data Analysis & Machine Learning 📊</h1>"
st.markdown(title_html, unsafe_allow_html=True)

# ---------------------------------------------------------------------------
# Main flow: upload a CSV, show descriptive statistics and distribution plots
# with AI-generated explanations, then offer K-Means clustering and a
# single-feature linear regression on the numeric columns.
# ---------------------------------------------------------------------------


def _request_explanation(prompt):
    """Send *prompt* to the OpenAI chat model and return the reply text.

    Works with both generations of the ``openai`` package: releases from 1.0
    onward expose ``openai.chat.completions.create`` (and removed
    ``openai.ChatCompletion``), while older releases only offer the latter.

    Raises:
        Exception: whatever the OpenAI client raises; callers report it.
    """
    messages = [
        {"role": "system", "content": "You are a data analysis assistant."},
        {"role": "user", "content": prompt},
    ]
    if hasattr(openai, "chat"):
        response = openai.chat.completions.create(model="gpt-3.5-turbo", messages=messages)
        return response.choices[0].message.content
    response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
    return response["choices"][0]["message"]["content"]


def _show_explanation(heading, prompt):
    """Render the AI explanation for *prompt* under *heading*, or an error box."""
    try:
        explanation = _request_explanation(prompt)
    except Exception as e:  # UI boundary: surface any API failure to the user
        st.error(f"OpenAI API error: {str(e)}")
    else:
        st.markdown(heading)
        st.write(explanation)


def _show_figure(fig):
    """Render *fig* in Streamlit, then close it.

    Figures created through pyplot stay registered until closed, so closing
    after rendering keeps them from accumulating.
    """
    st.pyplot(fig)
    plt.close(fig)


# File uploader
uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])
if uploaded_file is not None:
    try:
        data = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as e:
        # An unreadable upload should produce a message, not a traceback.
        st.error(f"Could not read CSV file: {e}")
        st.stop()

    st.write("Preview of Data:")
    st.dataframe(data.head())

    # Descriptive statistics
    st.subheader("Descriptive Statistics")
    numeric_columns = data.select_dtypes(include=[np.number]).columns

    if numeric_columns.empty:
        st.warning("No numeric columns found in the uploaded file.")
    else:
        st.write("Numeric columns detected:")
        st.write(numeric_columns.tolist())

        numeric_data = data[numeric_columns]

        mean_values = numeric_data.mean()
        median_values = numeric_data.median()
        modes = numeric_data.mode()
        # mode() has no rows when every numeric column is entirely NaN.
        if modes.empty:
            mode_values = pd.Series(np.nan, index=numeric_columns)
        else:
            mode_values = modes.iloc[0]
        var_values = numeric_data.var()
        std_values = numeric_data.std()
        skew_values = numeric_data.skew()
        kurt_values = numeric_data.kurt()
        range_values = numeric_data.max() - numeric_data.min()

        summary = [
            ("Mean:", mean_values),
            ("Median:", median_values),
            ("Mode:", mode_values),
            ("Variance:", var_values),
            ("Standard Deviation:", std_values),
            ("Skewness:", skew_values),
            ("Kurtosis:", kurt_values),
            ("Range:", range_values),
        ]
        for label, values in summary:
            st.write(label)
            st.write(values)

        # Visualization: histogram (with mean/median markers) and boxplot per column
        st.subheader("Data Distribution Visualization")
        for col in numeric_columns:
            st.write(f"Distribution for {col}")
            fig, ax = plt.subplots(1, 2, figsize=(10, 4))

            sns.histplot(data[col], kde=True, ax=ax[0], color="skyblue")
            ax[0].axvline(mean_values[col], color="red", linestyle="--", label="Mean")
            ax[0].axvline(median_values[col], color="green", linestyle="--", label="Median")
            ax[0].set_title("Histogram")
            ax[0].legend()

            sns.boxplot(x=data[col], ax=ax[1], color="lightgreen")
            ax[1].set_title("Boxplot")

            _show_figure(fig)

            # AI Explanation
            _show_explanation(
                f"**Explanation for {col}:**",
                f"Explain the histogram and boxplot for column {col}. Mean: {mean_values[col]}, Median: {median_values[col]}, Mode: {mode_values[col]}, Variance: {var_values[col]}, Std Dev: {std_values[col]}, Skew: {skew_values[col]}, Kurtosis: {kurt_values[col]}",
            )

        # Correlation Matrix
        st.subheader("Correlation Matrix")
        corr_matrix = numeric_data.corr()
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)
        _show_figure(fig)

        _show_explanation(
            "**Correlation Explanation:**",
            f"Explain the correlation matrix: {corr_matrix.to_dict()}",
        )

        # Clustering
        st.subheader("Clustering")
        n_clusters = st.slider("Select number of clusters", 2, 10, 3)
        # KMeans rejects NaN values, so only rows with every numeric value
        # present take part in clustering.
        cluster_features = numeric_data.dropna()

        st.subheader("Elbow Method to Determine Optimal Clusters")
        if st.button("Generate Elbow Plot"):
            try:
                # KMeans cannot fit more clusters than there are samples.
                max_k = min(10, len(cluster_features))
                if max_k < 1:
                    raise ValueError("no complete numeric rows available for clustering")
                k_values = list(range(1, max_k + 1))
                wcss = [
                    KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=42)
                    .fit(cluster_features)
                    .inertia_
                    for k in k_values
                ]

                fig, ax = plt.subplots()
                ax.plot(k_values, wcss, marker="o", linestyle="--")
                ax.set_title("Elbow Method")
                ax.set_xlabel("Number of Clusters")
                ax.set_ylabel("WCSS")
                _show_figure(fig)
            except Exception as e:
                st.error(f"Elbow plot error: {str(e)}")

        if st.button("Perform Clustering"):
            try:
                # n_init matches the elbow fits so both use the same procedure.
                kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
                clustered = data.loc[cluster_features.index].copy()
                clustered['Cluster'] = kmeans.fit_predict(cluster_features)
                st.dataframe(clustered.head())

                if len(numeric_columns) < 2:
                    # The scatter plot uses the first two numeric columns as axes.
                    st.info("At least two numeric columns are needed for the cluster scatter plot.")
                else:
                    x_name, y_name = numeric_columns[0], numeric_columns[1]
                    fig, ax = plt.subplots()
                    scatter = ax.scatter(clustered[x_name], clustered[y_name], c=clustered['Cluster'], cmap='viridis')
                    ax.set_xlabel(x_name)
                    ax.set_ylabel(y_name)
                    ax.set_title("Cluster Distribution")
                    # Attach the legend to this Axes rather than pyplot's current one.
                    ax.legend(*scatter.legend_elements(), title="Clusters")
                    _show_figure(fig)
            except Exception as e:
                st.error(f"Clustering error: {str(e)}")

        # Linear Regression
        st.subheader("Linear Regression")
        x_col = st.selectbox("Select X (feature)", numeric_columns, key="x_col")
        y_col = st.selectbox("Select Y (target)", numeric_columns, key="y_col")

        if st.button("Run Regression"):
            try:
                # LinearRegression rejects NaN; keep rows where both columns are present.
                complete = data[x_col].notna() & data[y_col].notna()
                X = data.loc[complete, [x_col]].values
                y = data.loc[complete, y_col].values
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

                model = LinearRegression()
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                st.write(f"R² Score: {model.score(X_test, y_test):.3f}")

                fig, ax = plt.subplots()
                ax.scatter(X_test, y_test, color="blue", label="Actual")
                ax.plot(X_test, y_pred, color="red", label="Prediction", linewidth=2)
                ax.set_xlabel(x_col)
                ax.set_ylabel(y_col)
                ax.set_title("Linear Regression")
                ax.legend()
                _show_figure(fig)
            except Exception as e:
                st.error(f"Regression error: {str(e)}")


2025-04-16 22:59:33.876 
  command:

    streamlit run C:\Users\reung\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\ipykernel_launcher.py [ARGUMENTS]
