<a href="https://colab.research.google.com/github/Pratheek-05/codtech-projects/blob/main/data_analytics_poject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install PySpark (if not already installed)
!pip install pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import pandas as pd  # Import pandas for reading CSV from URL

# Initialize a Spark session named "BigDataAnalysis"
spark = SparkSession.builder.appName("BigDataAnalysis").getOrCreate()

# Everything that uses the session runs inside try/finally so that spark.stop()
# still executes when a step fails (for example, if the CSV download raises).
try:
    # Load a sample dataset (can replace with a large CSV file).
    # pandas downloads the CSV from the URL; the in-memory pandas frame is then
    # converted into a Spark DataFrame.
    pandas_df = pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv")
    df = spark.createDataFrame(pandas_df)

    # Display the schema
    df.printSchema()

    # Show some sample data
    df.show(5)

    # Perform basic analysis: calculate the average tip amount per day
    avg_tip = df.groupBy("day").avg("tip")
    avg_tip.show()
finally:
    # Stop the Spark session, whether or not the analysis above succeeded
    spark.stop()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: long (nullable = true)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows

+----+------------------+
| day|          avg(tip)|
+----+------------------+
|Thur| 2.771451612903226|
| Sun|3.2551315789473687|
| Sat|2.9931034482758623|
| Fri| 2.734736842105263|
+----+------------------+



In [3]:
# Install required libraries
!pip install scikit-learn pandas numpy matplotlib seaborn

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Build a feature table from scikit-learn's bundled Iris data and append the
# class labels as the last column, 'target'.
from sklearn.datasets import load_iris
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='target'),  # every column except the label
    df['target'],
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training split (fit() returns the estimator).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Report overall accuracy and the per-class precision/recall/F1 table
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("Classification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))


Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [4]:
# Install required libraries
!pip install dash pandas

import dash
from dash import dcc, html
import pandas as pd
import plotly.express as px
from dash.dependencies import Input, Output

# Sample dataset (can replace with real dataset): the Iris CSV from the seaborn-data repository
df = pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv")

# Create the Dash application object
app = dash.Dash(__name__)

# Page layout: a heading, a dropdown listing every column except "species",
# and the graph component that the callback fills in.
app.layout = html.Div(
    children=[
        html.H1("Iris Dataset Dashboard"),
        dcc.Dropdown(
            id="feature",
            options=[
                {"label": column, "value": column}
                for column in df.columns
                if column != "species"
            ],
            value="sepal_length",  # feature selected when the page first loads
        ),
        # NOTE: the component id says "scatter-plot", but the callback renders a histogram.
        dcc.Graph(id="scatter-plot"),
    ]
)

# Callback to update the graph
@app.callback(
    Output("scatter-plot", "figure"),
    Input("feature", "value")
)
def update_graph(selected_feature):
    """Build the histogram shown in the "scatter-plot" graph component.

    Args:
        selected_feature: Current value of the "feature" dropdown, i.e. a
            column name of ``df``, or ``None`` when the dropdown has been cleared.

    Returns:
        A Plotly histogram of ``selected_feature`` colored by "species", or
        ``dash.no_update`` (keep the figure currently displayed) when no
        feature is selected.
    """
    # dcc.Dropdown is clearable by default, so this callback can fire with None;
    # plotting in that case would produce a figure titled "Distribution of None".
    if not selected_feature:
        return dash.no_update

    return px.histogram(
        df,
        x=selected_feature,
        color="species",
        title=f"Distribution of {selected_feature}",
    )

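# Run the app (uncomment the last line of this cell to run it locally or in Colab).
# NOTE: `app.run` is the replacement for `app.run_server`, which was removed in Dash 3.
# app.run(debug=True, use_reloader=False)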


Collecting dash
  Downloading dash-2.18.2-py3-none-any.whl.metadata (10 kB)
Collecting Flask<3.1,>=1.0.4 (from dash)
  Downloading flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting Werkzeug<3.1 (from dash)
  Downloading werkzeug-3.0.6-py3-none-any.whl.metadata (3.7 kB)
Collecting dash-html-components==2.0.0 (from dash)
  Downloading dash_html_components-2.0.0-py3-none-any.whl.metadata (3.8 kB)
Collecting dash-core-components==2.0.0 (from dash)
  Downloading dash_core_components-2.0.0-py3-none-any.whl.metadata (2.9 kB)
Collecting dash-table==5.0.0 (from dash)
  Downloading dash_table-5.0.0-py3-none-any.whl.metadata (2.4 kB)
Collecting retrying (from dash)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Downloading dash-2.18.2-py3-none-any.whl (7.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m67.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB)
Downloading dash_html_compo

In [5]:
# Install required libraries
!pip install nltk pandas

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd

# Fetch the VADER lexicon that SentimentIntensityAnalyzer needs
nltk.download('vader_lexicon')

# Create the sentiment analyzer
sia = SentimentIntensityAnalyzer()

# A handful of example reviews to score
reviews = [
    "I love this product! It's amazing.",
    "This is the worst experience I've ever had.",
    "It's okay, but could be better.",
    "Absolutely fantastic service!",
    "I hate it so much.",
]

# Score each review, keeping only the 'compound' entry of the polarity scores
results = [
    {
        "Review": text,
        "Sentiment Score": sia.polarity_scores(text)['compound'],
    }
    for text in reviews
]

# Tabulate the scores (one row per review) and print the table
df = pd.DataFrame(data=results)
print(df)


                                        Review  Sentiment Score
0           I love this product! It's amazing.           0.8516
1  This is the worst experience I've ever had.          -0.6249
2              It's okay, but could be better.           0.6486
3                Absolutely fantastic service!           0.6352
4                           I hate it so much.          -0.5719


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
