In [5]:
import pandas as pd
import numpy as np
import random

In [6]:

# Location of the movie dataset, resolved relative to the notebook's working directory.
DATASET_PATH = "adventuredataset.csv"

# Load the whole CSV into the DataFrame every later cell reads from.
data = pd.read_csv(DATASET_PATH)

In [7]:
# Preview the first five rows (5 is also head()'s default) as the cell's rich output.
data.head(n=5)

Unnamed: 0,Movie_ID,Title,Year,Genre,Director,Budget_Million,BoxOffice_Million,Runtime_Minutes,Rating,Award_Won
0,MOV000001,Adventure Film 1,2018,Adventure,Director A,82.078031,441.753825,114.155228,5.0,Yes
1,MOV000002,Adventure Film 2,2008,Adventure,Director A,108.957053,86.884441,114.693611,4.7,Yes
2,MOV000003,Adventure Film 3,1994,Adventure,Director C,37.795688,87.241374,113.393664,9.7,No
3,MOV000004,Adventure Film 4,2022,Sci-Fi,Director D,120.161778,231.401774,106.589559,6.3,No
4,MOV000005,Adventure Film 5,1987,Adventure,Director C,132.718911,70.943526,142.04464,6.5,Yes


In [7]:

# DataFrame.info() writes its report directly to stdout and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Movie_ID           80000 non-null  object 
 1   Title              80000 non-null  object 
 2   Year               80000 non-null  int64  
 3   Genre              80000 non-null  object 
 4   Director           80000 non-null  object 
 5   Budget_Million     80000 non-null  float64
 6   BoxOffice_Million  80000 non-null  float64
 7   Runtime_Minutes    80000 non-null  float64
 8   Rating             80000 non-null  float64
 9   Award_Won          80000 non-null  object 
dtypes: float64(4), int64(1), object(5)
memory usage: 6.1+ MB
None


In [8]:

# Descriptive statistics produced by describe(), printed as plain text.
numeric_summary = data.describe()
print(numeric_summary)

               Year  Budget_Million  BoxOffice_Million  Runtime_Minutes  \
count  80000.000000    80000.000000       80000.000000     80000.000000   
mean    2001.509425      100.703075         200.406060       126.957796   
std       12.660702       48.357263          97.593747        14.465309   
min     1980.000000       10.000000           5.000000        80.000000   
25%     1991.000000       66.286950         132.262816       116.967810   
50%     2002.000000      100.072418         199.069221       126.128505   
75%     2012.000000      133.697708         266.389786       136.303363   
max     2023.000000      289.109392         621.936634       180.000000   

             Rating  
count  80000.000000  
mean       5.498771  
std        2.593778  
min        1.000000  
25%        3.300000  
50%        5.500000  
75%        7.700000  
max       10.000000  


In [8]:

# Number of movies per genre, most frequent first (value_counts default ordering).
genre_counts = data["Genre"].value_counts()
print(genre_counts)

Genre
Adventure    56110
Sci-Fi        8021
Fantasy       7951
Action        7918
Name: count, dtype: int64


In [9]:

# Mean of the BoxOffice_Million column across all rows.
mean_box_office = data["BoxOffice_Million"].mean()
print("Average Box Office Revenue (Million USD):", mean_box_office)

Average Box Office Revenue (Million USD): 200.40606034174928


In [10]:

# Find the largest budget, then show every row that reaches it
# (more than one row is printed if several movies tie).
budget_column = data["Budget_Million"]
max_budget = budget_column.max()
print("Maximum Budget (Million USD):", max_budget)
print(data.loc[budget_column.eq(max_budget)])

Maximum Budget (Million USD): 289.10939244983206
        Movie_ID                 Title  Year      Genre    Director  \
36373  MOV036374  Adventure Film 36374  2020  Adventure  Director A   

       Budget_Million  BoxOffice_Million  Runtime_Minutes  Rating Award_Won  
36373      289.109392         243.903081        116.06643     1.3        No  


In [11]:

# Tally of each Award_Won value; the printed table lists every category found
# in the column, not only the award winners.
award_count = data["Award_Won"].value_counts()
print("Movies with Awards:", award_count, sep="\n")

Movies with Awards:
Award_Won
No     56148
Yes    23852
Name: count, dtype: int64


In [13]:

# Median of the Runtime_Minutes column.
median_runtime = data["Runtime_Minutes"].median()
print("Median Runtime (Minutes):", median_runtime)

Median Runtime (Minutes): 126.12850455408491


In [12]:

# Correlation between budget and box-office revenue; Series.corr is called with
# its default method, which is Pearson.
budget_series = data["Budget_Million"]
box_office_series = data["BoxOffice_Million"]
correlation = budget_series.corr(box_office_series)
print("Correlation between Budget and Box Office Revenue:", correlation)

Correlation between Budget and Box Office Revenue: 0.0004596886358127435


In [13]:

# Count movies per director, then keep the five most frequent.
director_counts = data["Director"].value_counts()
top_directors = director_counts.head(5)
print("Top 5 Directors by Movie Count:", top_directors, sep="\n")

Top 5 Directors by Movie Count:
Director
Director E    16051
Director B    16050
Director D    16007
Director C    15992
Director A    15900
Name: count, dtype: int64


In [14]:

# Mean box-office revenue for each release year.
revenue_by_year = data.groupby("Year")["BoxOffice_Million"]
yearly_revenue = revenue_by_year.mean()

# idxmax() gives the year label of the peak, max() the peak value itself.
highest_year, highest_revenue = yearly_revenue.idxmax(), yearly_revenue.max()

print("Year with Highest Average Box Office Revenue:", highest_year)
print("Highest Average Revenue (Million USD):", highest_revenue)

Year with Highest Average Box Office Revenue: 2012
Highest Average Revenue (Million USD): 205.1806124573583


In [15]:

# Bucket ratings into 10 equal-width bins; sort=False keeps the bins in
# ascending interval order instead of ordering them by count.
rating_distribution = data["Rating"].value_counts(bins=10, sort=False)
print("Rating Distribution:")
print(rating_distribution)

Rating Distribution:
(0.99, 1.9]    8340
(1.9, 2.8]     8038
(2.8, 3.7]     8126
(3.7, 4.6]     8009
(4.6, 5.5]     8007
(5.5, 6.4]     7985
(6.4, 7.3]     8044
(7.3, 8.2]     7919
(8.2, 9.1]     7990
(9.1, 10.0]    7542
Name: count, dtype: int64


In [18]:

# normalize=True yields fractions of the total; scale them to percentages.
genre_share = data["Genre"].value_counts(normalize=True)
genre_percentage = genre_share * 100
print("Percentage of Movies by Genre:", genre_percentage, sep="\n")

Percentage of Movies by Genre:
Genre
Adventure    70.13750
Sci-Fi       10.02625
Fantasy       9.93875
Action        9.89750
Name: proportion, dtype: float64


In [16]:

# Percentage share of every Award_Won value; only the "Yes" entry is reported.
award_share_pct = data["Award_Won"].value_counts(normalize=True) * 100
award_percentage = award_share_pct.loc["Yes"]
print("Percentage of Movies Winning Awards:", award_percentage)

Percentage of Movies Winning Awards: 29.815


In [5]:

# Keep only rows flagged Award_Won == "Yes", then count them per genre.
award_winners = data.loc[data["Award_Won"].eq("Yes")]
print("Genre-wise Award-Winning Movie Counts:")
print(award_winners.groupby("Genre").size())

Genre-wise Award-Winning Movie Counts:
Genre
Action        2504
Adventure    16585
Fantasy       2363
Sci-Fi        2400
dtype: int64


In [6]:

# Correlation between runtime and rating (Series.corr default method: Pearson).
runtime_series = data["Runtime_Minutes"]
rating_series = data["Rating"]
runtime_rating_corr = runtime_series.corr(rating_series)
print("Correlation between Runtime and Rating:", runtime_rating_corr)

Correlation between Runtime and Rating: -0.007358217930980107


In [17]:

# Revenue tiers in million USD. pd.cut intervals are right-closed by default,
# so the tiers are (0, 100], (100, 300], (300, 600] and (600, 1000].
revenue_bin_edges = [0, 100, 300, 600, 1000]
revenue_bin_labels = ["Low", "Moderate", "High", "Blockbuster"]

print("Box Office Revenue Range Segmentation:")
# Adds a new categorical column to `data` in place.
data["Revenue_Range"] = pd.cut(
    data["BoxOffice_Million"],
    bins=revenue_bin_edges,
    labels=revenue_bin_labels,
)
print(data["Revenue_Range"].value_counts())

Box Office Revenue Range Segmentation:
Revenue_Range
Moderate       54669
Low            12690
High           12640
Blockbuster        1
Name: count, dtype: int64


In [18]:

print("Top Director-Genre Pairs by Average Box Office Revenue:")
# Mean revenue for every (Director, Genre) combination ...
pair_mean_revenue = data.groupby(["Director", "Genre"])["BoxOffice_Million"].mean()
# ... ranked from highest to lowest, keeping the ten best pairs.
ranked_pairs = pair_mean_revenue.sort_values(ascending=False)
director_genre_pairs = ranked_pairs.head(10)
print(director_genre_pairs)

Top Director-Genre Pairs by Average Box Office Revenue:
Director    Genre    
Director E  Fantasy      207.204179
Director A  Action       205.251794
            Fantasy      204.438640
Director D  Action       204.294007
Director C  Fantasy      203.523128
Director D  Sci-Fi       202.476883
Director B  Sci-Fi       202.241365
Director D  Adventure    201.721351
Director C  Sci-Fi       201.277750
Director A  Adventure    200.862617
Name: BoxOffice_Million, dtype: float64


In [24]:

print("Award-Winning Trends Over Years:")
# Count of award-winning movies (Award_Won == "Yes") per release year.
awarded_movies = data.loc[data["Award_Won"].eq("Yes")]
award_trend = awarded_movies.groupby("Year").size()
print(award_trend)

Award-Winning Trends Over Years:
Year
1980    559
1981    538
1982    575
1983    547
1984    555
1985    529
1986    529
1987    532
1988    534
1989    549
1990    534
1991    540
1992    571
1993    531
1994    525
1995    537
1996    602
1997    551
1998    574
1999    519
2000    540
2001    536
2002    606
2003    545
2004    522
2005    598
2006    540
2007    591
2008    525
2009    539
2010    519
2011    533
2012    500
2013    537
2014    493
2015    578
2016    532
2017    528
2018    537
2019    513
2020    527
2021    529
2022    496
2023    557
dtype: int64


In [19]:
# Budget outliers via the 1.5 * IQR fence rule
print("Budget Outliers Analysis:")
budget_values = data["Budget_Million"]
q1 = budget_values.quantile(0.25)
q3 = budget_values.quantile(0.75)
iqr = q3 - q1

# Anything strictly below the lower fence or strictly above the upper fence
# counts as an outlier.
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
is_outlier = budget_values.lt(lower_fence) | budget_values.gt(upper_fence)
outliers = data[is_outlier]
print("Number of Budget Outliers:", len(outliers))

Budget Outliers Analysis:
Number of Budget Outliers: 264


In [20]:
# Per-genre summary statistics of the Rating column
rating_by_genre = data.groupby("Genre")["Rating"].describe()
print("Rating Distribution by Genre:")
print(rating_by_genre)

Rating Distribution by Genre:
             count      mean       std  min  25%  50%  75%   max
Genre                                                           
Action      7918.0  5.540566  2.586453  1.0  3.3  5.6  7.8  10.0
Adventure  56110.0  5.492836  2.596140  1.0  3.2  5.5  7.7  10.0
Fantasy     7951.0  5.457024  2.584686  1.0  3.3  5.4  7.7  10.0
Sci-Fi      8021.0  5.540419  2.592894  1.0  3.3  5.6  7.8  10.0


In [21]:

print("Runtime Impact on Box Office Revenue:")
# Correlation between runtime and revenue (Series.corr default method: Pearson).
runtime_minutes = data["Runtime_Minutes"]
box_office_revenue = data["BoxOffice_Million"]
runtime_revenue_corr = runtime_minutes.corr(box_office_revenue)
print("Correlation between Runtime and Box Office Revenue:", runtime_revenue_corr)

Runtime Impact on Box Office Revenue:
Correlation between Runtime and Box Office Revenue: -0.002474423353551728


In [28]:

print("Box Office Performance by Decade:")
# Floor each year to the start of its decade (e.g. 1987 -> 1980); this adds a
# new "Decade" column to `data` in place.
data["Decade"] = data["Year"].floordiv(10).mul(10)
# Mean box-office revenue within each decade.
revenue_by_decade = data.groupby("Decade")["BoxOffice_Million"]
decade_performance = revenue_by_decade.mean()
print(decade_performance)

Box Office Performance by Decade:
Decade
1980    200.347302
1990    200.740989
2000    200.091444
2010    200.823270
2020    199.472841
Name: BoxOffice_Million, dtype: float64


In [22]:

# Budget tiers in million USD. pd.cut intervals are right-closed by default,
# so the tiers are (0, 50], (50, 100], (100, 200] and (200, 300].
budget_bin_edges = [0, 50, 100, 200, 300]
budget_bin_labels = ["Low", "Moderate", "High", "Very High"]

print("Most Common Budget Range:")
# Adds a new categorical column to `data` in place.
data["Budget_Range"] = pd.cut(
    data["Budget_Million"],
    bins=budget_bin_edges,
    labels=budget_bin_labels,
)
print(data["Budget_Range"].value_counts())

Most Common Budget Range:
Budget_Range
High         38253
Moderate     27278
Low          12674
Very High     1795
Name: count, dtype: int64


In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [51]:

# Feature matrix for classification. The double brackets keep the selection
# two-dimensional (one column, shape (n_samples, 1)); scikit-learn estimators
# reject a 1-D Series as X with "Expected 2D array, got 1D array instead".
X_class = data[['Rating']]
# Target variable: the Award_Won labels ("Yes" / "No").
y_class = data['Award_Won']

In [52]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# NOTE(review): this cell trains on randomly generated placeholder data, not on
# the movie DataFrame `data`, so the metrics below say nothing about the movies.

# Fix the global NumPy seed so the synthetic sample is reproducible.
np.random.seed(42)
num_samples = 100

# Synthetic inputs: three uniform random features, followed by a random 0/1 target.
# The features are drawn first so the random stream is consumed in the same order.
feature_names = ['Feature1', 'Feature2', 'Feature3']
X = pd.DataFrame(np.random.rand(num_samples, 3), columns=feature_names)
y = np.random.randint(0, 2, num_samples)

# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Logistic regression with default hyper-parameters; fit() returns the estimator.
log_model = LogisticRegression().fit(X_train_class, y_train_class)

# Predict the held-out rows and score them.
y_pred = log_model.predict(X_test_class)
accuracy = accuracy_score(y_test_class, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_test_class, y_pred))

# Learned parameters of the fitted model.
print("\nModel Coefficients:", log_model.coef_)
print("Model Intercept:", log_model.intercept_)


Model Accuracy: 0.55

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.20      0.31        10
           1       0.53      0.90      0.67        10

    accuracy                           0.55        20
   macro avg       0.60      0.55      0.49        20
weighted avg       0.60      0.55      0.49        20


Model Coefficients: [[-0.33991252  0.57538815  0.00959808]]
Model Intercept: [0.0699518]


In [55]:
from sklearn.linear_model import LogisticRegression

# Re-creating the estimator discards any previously learned parameters, so it
# must be fitted again here. Otherwise the next predict() call on `log_model`
# raises NotFittedError when the notebook is run top to bottom.
log_model = LogisticRegression()
log_model.fit(X_train_class, y_train_class)


In [49]:
# Predict class labels for the held-out test rows with the current log_model
y_pred_class = log_model.predict(X=X_test_class)
# Bare last expression so the notebook displays the predicted label array.
y_pred_class

array([0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1])

In [56]:
# Score the test-set predictions: overall accuracy plus the confusion matrix
# (rows are true classes, columns are predicted classes).
accuracy = accuracy_score(y_true=y_test_class, y_pred=y_pred_class)
conf_matrix = confusion_matrix(y_true=y_test_class, y_pred=y_pred_class)

print(f"Accuracy: {accuracy}", f"Confusion Matrix:\n{conf_matrix}", sep="\n")

Accuracy: 0.55
Confusion Matrix:
[[2 8]
 [1 9]]
