In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
#setting seaborn style
sns.set_style('darkgrid')

In [3]:
df = pd.read_csv('mdc.csv', encoding='latin-1', index_col=0)

FileNotFoundError: File b'mdc.csv' does not exist

In [None]:
df.head()

In [None]:
df.columns

In [None]:
len(df.columns)

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.isna().sum()

In [None]:
df['genre'].unique()

In [None]:
genre_count = df['genre'].value_counts()

In [None]:
#most common genre

genre_count.plot(kind='bar')

In [None]:
#most common genre entitywise

grp_entity = df.groupby('entity')
grp_entity['genre'].value_counts()

In [None]:
grp_entity.median()

In [None]:
grp_entity.plot()

In [None]:
sns.barplot(x='entity', y='imdb_rating', data=df)

In [None]:
sns.countplot(x='entity', data=df)

In [None]:
sns.pairplot(data=df)

In [None]:
# The two rows with the highest IMDb rating.

df.nlargest(n=2, columns=['imdb_rating'])

In [None]:
# Five genres with the longest median runtime.

grp_genre = df.groupby('genre')
# Aggregate only the 'runtime' column: taking the median of every column first is wasted work
# and raises TypeError on recent pandas when the frame contains text columns.
grp_genre['runtime'].median().sort_values(ascending=False).head(5)

In [None]:
# Genre frequencies within each year.

grp_year = df.groupby('year')
genre_counts_by_year = grp_year['genre'].value_counts()
# Skip the first 55 (year, genre) rows by position.
# NOTE(review): 55 is an unexplained magic offset — confirm which years it is meant to drop.
genre_counts_by_year.iloc[55:]

In [None]:
grp_genre.median()['imdb_rating'].sort_values(ascending=False)[:5]

In [None]:
grp_genre.median()['imdb_rating'].sort_values(ascending=False)[:5].plot(kind='bar')

In [None]:
#median imdb rating of "Action, Adventure, Sci-Fi" genre (because its the most common genre)

grp_genre.median()['imdb_rating'].loc['Action, Adventure, Sci-Fi']

In [None]:
#top10 imdb ratings grouped by entity and genre

grp_two = df.groupby(['genre', 'entity'])
grp_two.median()['imdb_rating'].sort_values(ascending=False)[:10]

In [None]:
grp_two.median()['imdb_rating'].sort_values(ascending=False)[:5].plot(kind='bar')

# Linear Regression

In [None]:
y = np.array(df['imdb_rating']).reshape(-1, 1)
X = np.array(df['imdb_gross']).reshape(-1, 1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [None]:
lreg = LinearRegression()

In [None]:
lreg.fit(X_train, y_train)

In [None]:
p = lreg.predict(X_test)

In [None]:
print(lreg.score(X_test, y_test))

In [None]:
df_p = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': p.flatten()})
df_p.head()

In [None]:
mean_squared_error(y_test, p)

# Decision Tree Classifier

In [None]:
dummy = pd.get_dummies(df['entity'])

In [None]:
dummy.head()

In [None]:
df2 = pd.concat((df, dummy), axis=1)

In [None]:
df2 = df2.drop('MARVEL', axis=1)

In [None]:
#universe: 0 is marvel, 1 is DC
df2 = df2.rename(columns={'DC': 'uni'})

In [None]:
df2.head()

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
X = np.array(df2['imdb_rating']).reshape(-1, 1)
#universe: 0 is marvel, 1 is DC
y = np.array(df2['uni']).reshape(-1, 1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
dtc = DecisionTreeClassifier()

In [None]:
dtc.fit(X_train, y_train)

In [None]:
y_pred = dtc.predict(X_test)

In [None]:
ans = pd.DataFrame({'predicted': y_pred.flatten(), 'actual': y_test.flatten()})
ans.head()

In [None]:
dtc.score(X_test, y_test)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [None]:
X = np.array(df2['imdb_rating']).reshape(-1, 1)
#universe: 0 is marvel, 1 is DC
y = np.array(df2['uni']).reshape(-1, 1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
logistic_reg = LogisticRegression()

In [None]:
logistic_reg.fit(X_train, y_train)

In [None]:
y_pred = logistic_reg.predict(X_test)

In [None]:
prediction = pd.DataFrame({'predicted': y_pred.flatten(), 'actual': y_test.flatten()})
prediction

In [None]:
score = logistic_reg.score(X_test, y_test)

In [None]:
cm = metrics.confusion_matrix(y_test, y_pred)
cm

In [None]:
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt='.3f', linewidths=.5, square=True, cmap='Blues_r')
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title, size = 15);

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
X = np.array(df2['imdb_rating']).reshape(-1, 1)
#universe: 0 is marvel, 1 is DC
y = np.array(df2['uni']).reshape(-1, 1)

In [None]:
knn = KNeighborsClassifier()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
knn.fit(X_train, y_train)

In [None]:
y_pred = knn.predict(X_test)

In [None]:
prediction = pd.DataFrame({'predicted': y_pred.flatten(), 'actual': y_test.flatten()})

In [None]:
prediction.head()

In [None]:
cm = metrics.confusion_matrix(y_test, y_pred)
cm

In [None]:
score = knn.score(X_test, y_test)

In [None]:
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt='.3f', linewidths=.5, square=True, cmap='Blues_r')
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title, size = 15);