In [None]:
import os
from subprocess import check_output

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Location of the Twitch CSV. Set the TWITCH_DATA_CSV environment variable to point
# at your own copy; the fallback is the path this notebook was originally written with.
DATA_PATH = os.environ.get(
    'TWITCH_DATA_CSV',
    '/Users/yuwei/Documents/学习资料/7008/作业/project/twitchdata-update.csv',
)

# The file is read as latin-1, matching the encoding the notebook has always used.
df = pd.read_csv(DATA_PATH, encoding='latin-1')
df.head()

In [None]:
# Quick overview of the data: its (rows, columns) shape, then the number of
# distinct values found in each column.
for overview in (df.shape, df.nunique()):
    print(overview)

In [None]:
# Summary statistics, transposed so that each described column becomes one row.
df.describe().transpose()

In [None]:
# Explore whether the main audience metrics look roughly normal on a log scale.
# Each entry is (source column, derived log column, subplot title).
log_specs = [
    ('Watch time(Minutes)', 'Watch times(Minutes)_log', "Watch times(Minutes) LOG DISTRIBUITION"),
    ('Average viewers', 'Average viewers_log', "Average viewers LOG DISTRIBUITION"),
    ('Peak viewers', 'Peak viewers_log', "Peak viewers LOG DISTRIBUITION"),
    ('Followers gained', 'Followers gained_log', "Followers gained LOG DISTRIBUITION"),
]

plt.figure(figsize=(12, 10))
# Subplot codes 221..224 walk the 2x2 grid in row-major order.
for subplot_code, (source_col, log_col, title) in enumerate(log_specs, start=221):
    # np.log is undefined for values <= 0 (it yields -inf / NaN plus a RuntimeWarning).
    # Mask such values up front so they become NaN deliberately.
    # NOTE(review): a "gained" metric can plausibly be zero or negative - confirm on the data.
    positive_values = df[source_col].where(df[source_col] > 0)
    df[log_col] = np.log(positive_values)

    plt.subplot(subplot_code)
    # Only finite values are handed to the histogram / density estimate.
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept here
    # because the notebook targets the seaborn API it was written against.
    g1 = sns.distplot(df[log_col].dropna())
    g1.set_title(title, fontsize=8)

In [None]:
# Build one long space-separated string of every row's language and turn it
# into a word cloud, so more frequent languages are drawn larger.
language_text = " ".join(df.Language)
cloud_options = dict(background_color='white', width=1920, height=1280)
wordcloud = WordCloud(**cloud_options).generate(language_text)

# Render the cloud as an image on a fresh figure, without axis ticks or frame.
plt.subplots(figsize=(12,8))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# Text summary: the (up to) 21 most frequent languages.
print("Language count")
print(df.Language.value_counts().head(21))

# Three stacked panels sharing one figure: counts, watch time and followers gained.
fig, (count_ax, watch_ax, followers_ax) = plt.subplots(3, 1, figsize=(14, 15))

# `x` is passed by keyword: recent seaborn releases reject positional data arguments.
g = sns.countplot(x='Language', data=df, palette="Set1", ax=count_ax)
g.set_xticklabels(g.get_xticklabels(), rotation=45)
g.set_title("Counting the Video Language ", fontsize=15)
g.set_xlabel("", fontsize=12)
g.set_ylabel("Count", fontsize=12)

# Each boxplot rotates its OWN tick labels; borrowing them from the count plot would
# mislabel the boxes whenever the two plots order the categories differently.
g1 = sns.boxplot(x='Language', y='Watch times(Minutes)_log', data=df, palette="Set2", ax=watch_ax)
g1.set_xticklabels(g1.get_xticklabels(), rotation=45)
g1.set_title("Views Distribuition by Language", fontsize=20)
g1.set_xlabel("", fontsize=15)
g1.set_ylabel("Watch times(Minutes)(log)", fontsize=15)

g1 = sns.boxplot(x='Language', y='Followers gained_log', data=df, palette="Set3", ax=followers_ax)
g1.set_xticklabels(g1.get_xticklabels(), rotation=45)
g1.set_title("Followers Distribuition by Language", fontsize=20)
g1.set_xlabel("", fontsize=15)
g1.set_ylabel("Followers gained(log)", fontsize=15)

# Extra vertical room so the rotated tick labels do not collide with the next title.
plt.subplots_adjust(hspace = 0.7, top = 0.9)

plt.show()

In [None]:
# Rows for Polish-language channels only.
polish = df.loc[df['Language'] == "Polish"]

# Create the axes explicitly and hand them to pandas: DataFrame.plot.bar otherwise
# opens its own figure, leaving an empty one behind and ignoring the requested size.
fig, ax = plt.subplots(figsize=(20, 8))

# First 25 rows of the subset, in the order they appear in the data.
# The plotted frame and the title both refer to the same language as the filter above
# (the cell previously plotted an undefined `french` frame and titled it "French").
polish.head(25).plot.bar(x='Channel', y='Average viewers', color='blue', align='edge', ax=ax)
ax.set_title('Comparing the average viewers for the first 25 Polish streamers')
ax.set_xlabel('Streamers')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Two derived ratios stored as new columns:
#   QualityOutput  = watch minutes per streamed minute, scaled by 100
#   Trend_Streamer = peak viewers per average viewer, scaled by 100000
df['QualityOutput'] = df['Watch time(Minutes)'].div(df['Stream time(minutes)']).mul(100)
df['Trend_Streamer'] = df['Peak viewers'].div(df['Average viewers']).mul(100000)

plt.figure(figsize=(20, 12))

# Overlay both density curves (no histogram bars) on the same axes.
density_curves = (
    ('QualityOutput', 'red', "Watch"),
    ('Trend_Streamer', 'green', "Viewer"),
)
for ratio_col, curve_color, curve_label in density_curves:
    g1 = sns.distplot(df[ratio_col], color=curve_color, hist=False, label=curve_label)

g1.set_title('CONVERT RATE DISTRIBUITION', fontsize=16)

plt.legend()
plt.show()

In [None]:
# Scale both series to a maximum of 1 so they can share one axis.
# Series.max() is vectorized and skips NaN, unlike the builtin max() over a Series.
watch_normalized = df["Watch time(Minutes)"] / df["Watch time(Minutes)"].max()
stream_normalized = df["Stream time(minutes)"] / df["Stream time(minutes)"].max()

plt.grid()
# The x axis is the DataFrame index.
# NOTE(review): presumably rows are ordered by streamer rank - confirm against the CSV.
# Labels are attached to each call so the legend cannot get out of step with the plot order.
plt.plot(watch_normalized, label="Normalized watch time")
plt.plot(stream_normalized, "y+", label="Normalized Stream time")
plt.title('Comparing watch time and stream time (normalized)')
plt.xlabel("Streamer's rank")
plt.ylabel('Normalized value')
plt.legend()
plt.show()

In [None]:
# Row positions 0..N-1 serve as the x values; derived from the data instead of a
# hard-coded 1000 so the cell works for any number of rows.
ranks = np.arange(len(df))
stream_normalized = df["Stream time(minutes)"] / df["Stream time(minutes)"].max()
watch_normalized = df["Watch time(Minutes)"] / df["Watch time(Minutes)"].max()

# Least-squares line of normalized stream time against row position.
slope, intercept, r_value, p_value, std_err = stats.linregress(ranks, stream_normalized)


def predict(x):
    """Return the fitted regression value(s) ``slope * x + intercept`` for position(s) ``x``."""
    return slope * x + intercept


line = predict(ranks)

# Each label is attached to its own plot call: with a positional legend list the
# regression line (drawn first) was being labelled as the watch-time curve.
plt.plot(line, label="Normalized Stream time (regression)")
plt.plot(watch_normalized, label="Normalized watch time")
plt.title('Comparing watch time and stream time (regression)')

plt.xlabel("Streamer's rank")
plt.ylabel('Normalized value')
plt.legend()
plt.grid()
plt.show()
# r_value is the correlation between row position and normalized stream time
# (the two inputs of the regression above), not between watch time and stream time.
print("Correlation: {}".format(r_value))

In [None]:
plt.figure(figsize=(12,10))

heatmap_columns = [
    'Channel', 'Watch time(Minutes)', 'Stream time(minutes)', 'Followers',
    'Peak viewers', 'Average viewers', 'Followers gained', 'Views gained',
    'Partnered', 'Mature', 'Language',
]
# Correlation is only defined for numeric data. Older pandas silently dropped text
# columns inside .corr(); pandas 2.0+ raises instead. Selecting numeric and boolean
# columns explicitly gives the same matrix on either version.
numeric_subset = df[heatmap_columns].select_dtypes(include=['number', 'bool'])

sns.heatmap(numeric_subset.corr(), annot = True)
plt.title('Overall relation between columns of the Dataset', fontsize = 20)
plt.show()

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Columns are addressed by name: positional .iloc lookups pick a different column
# whenever the CSV layout differs, and shift once this cell has inserted its columns.
# NOTE(review): the original read positional column 3 for the streaming bins; this
# assumes 'Stream time(minutes)' was the intended source - confirm.
followers = df['Followers']
stream_minutes = df['Stream time(minutes)']

mean_follower = followers.mean()
max_follower = followers.max()
mean_stream = stream_minutes.mean()
max_stream = stream_minutes.max()

# Two bins per metric, split at the mean: (0, mean] and (mean, max].
# Values that are exactly 0 fall outside the first bin and become NaN.
popularity = pd.cut(followers, bins=[0, mean_follower, max_follower], labels=['less popular', 'popular'])
stream = pd.cut(stream_minutes, bins=[0, mean_stream, max_stream], labels=['less frequent', 'frequent'])

# Place each derived column right after its source column. On a re-run the column
# already exists, so it is overwritten instead of inserted (DataFrame.insert raises
# on duplicate names by default).
derived_columns = (
    ('Followers', 'level_of_popularity', popularity),
    ('Stream time(minutes)', 'frequency_of_streaming', stream),
)
for anchor_col, new_col, values in derived_columns:
    if new_col in df.columns:
        df[new_col] = values
    else:
        df.insert(df.columns.get_loc(anchor_col) + 1, new_col, values)

In [None]:
# Turn the categorical columns into integer codes. A single encoder instance is
# refit for every column in turn, so afterwards it only holds the classes of the
# last column it was fitted on.
lb_make = LabelEncoder()

# (destination column, source column) - only Language gets a separate destination
# column; the others are overwritten in place.
encoding_plan = [
    ("level_of_popularity", "level_of_popularity"),
    ("Language_code", "Language"),
    ("Partnered", "Partnered"),
    ("Mature", "Mature"),
    ("frequency_of_streaming", "frequency_of_streaming"),
]
for target_col, source_col in encoding_plan:
    df[target_col] = lb_make.fit_transform(df[source_col])

In [None]:
# Classification setup: predict the popularity level from four encoded features.
feature_columns = ['frequency_of_streaming', 'Partnered', 'Mature', 'Language_code']
X = pd.concat([df[column] for column in feature_columns], axis=1)
y = df['level_of_popularity']

# Hold out 25% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)


In [None]:
# Convert every split from its pandas container into a plain NumPy array.
X_train, X_test, y_train, y_test = map(np.array, (X_train, X_test, y_train, y_test))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# A random forest draws random bootstrap samples and feature subsets, so without a
# seed every run trains a different model. The seed matches the one used for the
# train/test split, making the reported metrics reproducible.
classifier = RandomForestClassifier(n_estimators=100, random_state=0)
# fit() returns the estimator itself, so `reg` is another name for `classifier`.
reg = classifier.fit(X_train, y_train)

In [None]:
# Predict the held-out rows, then score the predictions three ways.
y_pred = classifier.predict(X_test)

result = confusion_matrix(y_test, y_pred)
result1 = classification_report(y_test, y_pred)
result2 = accuracy_score(y_test, y_pred)

# Print the results: each heading on its own line, followed by its value.
print("Confusion Matrix:", result, sep="\n")
print("Classification Report:", result1, sep="\n")
print("Accuracy:", result2)

In [None]:
import statsmodels.formula.api as smf

# Collapse every language other than English into a single 'Others' level.
# This overwrites df['Language'] in place, so cells run after this one only see
# the two-level version of the column.
is_english = df['Language'] == 'English'
df['Language'] = df['Language'].where(is_english, 'Others')

# Ordinary least squares of Followers on three categorical predictors.
model = smf.ols(
    formula='Followers~C(Language)+C(frequency_of_streaming)+C(Mature)',
    data=df,
)
res = model.fit()
print(res.summary())