# Data Import

In [1]:
# Mount Google Drive so the shared-drive CSV path under /content/drive
# (read in the data-loading cell) is accessible from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd


In [3]:
# Read the prepared CSV from the shared drive and discard the
# 'Unnamed: 0' column in a single chained expression.
dataset = pd.read_csv(
    '/content/drive/Shareddrives/도터디/캡스톤 디자인/data/fin_data.csv'
).drop(columns=['Unnamed: 0'])

In [4]:
# Number of rows, measured on the 'jpg_count' column.
dataset['jpg_count'].shape[0]

1941

# Train Test Set

In [5]:
# Feature matrix: the 26 predictor columns, selected by label.
X = dataset.loc[:, [
    'image_count', 'jpg_count', 'video_count',
    'price', 'price_goal',
    'facebook_supporter_count', 'new_info_count', 'supporter_count',
    'like_count', 'newness', 'price_avg', 'product_count',
    'maker_respon_time', 'funding_duration',
    'title_length', 'summary_length', 'content_length',
    'encore',
    'image_percent', 'jpg_percent', 'video_percent',
    'price_power', 'market_saturation',
    'topic1', 'topic2', 'topic3',
]]
# Target vector: the `success` column.
y = dataset.loc[:, 'success']

In [6]:
from sklearn.model_selection import train_test_split

# Fix the shuffle seed so the split -- and therefore every score reported
# in the cells below -- is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# 표준화 (StandardScaler)

In [7]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply
# the same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# **다중공선성 및 회귀계수**

In [8]:
from statsmodels.formula.api import ols


In [9]:
# OLS specification: `success` regressed on all 26 predictors, using the
# full dataset. The formula is split across adjacent string literals purely
# for readability; they concatenate to a single formula string.
model = ols(
    'success ~ image_count + jpg_count + video_count + price + price_goal'
    ' + facebook_supporter_count + new_info_count + supporter_count'
    ' + like_count + newness + price_avg + product_count'
    ' + maker_respon_time + funding_duration + title_length'
    ' + summary_length + content_length + encore + image_percent'
    ' + jpg_percent + video_percent + price_power + market_saturation'
    ' + topic1 + topic2 + topic3',
    dataset,
)


In [10]:
# Fit the OLS model built from the formula cell above.
res = model.fit()


In [11]:
# Display the fitted model's summary tables.
res.summary()

0,1,2,3
Dep. Variable:,success,R-squared:,0.321
Model:,OLS,Adj. R-squared:,0.312
Method:,Least Squares,F-statistic:,34.87
Date:,"Wed, 26 Oct 2022",Prob (F-statistic):,1.16e-140
Time:,00:17:52,Log-Likelihood:,-128.5
No. Observations:,1941,AIC:,311.0
Df Residuals:,1914,BIC:,461.4
Df Model:,26,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0524,0.073,-0.719,0.472,-0.195,0.091
image_count,-0.0003,0.000,-0.992,0.321,-0.001,0.000
jpg_count,0.0032,0.001,5.455,0.000,0.002,0.004
video_count,-0.0171,0.006,-2.747,0.006,-0.029,-0.005
price,2.116e-08,2.54e-08,0.832,0.405,-2.87e-08,7.1e-08
price_goal,-3.344e-09,7.45e-10,-4.487,0.000,-4.81e-09,-1.88e-09
facebook_supporter_count,0.0016,0.001,2.253,0.024,0.000,0.003
new_info_count,5.272e-05,9.16e-06,5.753,0.000,3.47e-05,7.07e-05
supporter_count,5.013e-05,1.29e-05,3.893,0.000,2.49e-05,7.54e-05

0,1,2,3
Omnibus:,574.393,Durbin-Watson:,2.025
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1983.798
Skew:,1.446,Prob(JB):,0.0
Kurtosis:,7.02,Cond. No.,426000000.0


다중공선성 확인

In [12]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [13]:
# List the design-matrix column names (including 'Intercept'); their
# positions are what the VIF cell below indexes into `model.exog`.
model.exog_names


['Intercept',
 'image_count',
 'jpg_count',
 'video_count',
 'price',
 'price_goal',
 'facebook_supporter_count',
 'new_info_count',
 'supporter_count',
 'like_count',
 'newness',
 'price_avg',
 'product_count',
 'maker_respon_time',
 'funding_duration',
 'title_length',
 'summary_length',
 'content_length',
 'encore',
 'image_percent',
 'jpg_percent',
 'video_percent',
 'price_power',
 'market_saturation',
 'topic1',
 'topic2',
 'topic3']

In [14]:
# One row per regressor: its name ('컬럼') and its variance inflation factor.
pd.DataFrame([
    {'컬럼': name, 'VIF': variance_inflation_factor(model.exog, position)}
    for position, name in enumerate(model.exog_names)
    if name != 'Intercept'  # the intercept's VIF is not computed
])

Unnamed: 0,컬럼,VIF
0,image_count,1.492532
1,jpg_count,1.32004
2,video_count,1.085461
3,price,1.413845
4,price_goal,1.069009
5,facebook_supporter_count,1.027665
6,new_info_count,1.289104
7,supporter_count,3.473317
8,like_count,3.845115
9,newness,1.066507


In [15]:
#pd.DataFrame({'컬럼': column, 'VIF': variance_inflation_factor(model.exog, i)} 
#             for i, column in enumerate(model.exog_names)
#             if column != 'Intercept').to_csv('/content/drive/Shareddrives/도터디/캡스톤 디자인/data/vif.csv')

In [16]:
# Feature names in the same order as the columns of X. Coefficient vectors
# from models fitted on X_train are paired with these labels by position,
# so the list is derived from X instead of being retyped by hand; a second
# hard-coded copy could silently drift and mislabel coefficients.
feature = list(X.columns)

In [17]:
def model_coef(model, features=None):
  """Tabulate coefficients next to their feature names, largest first.

  Args:
    model: Array-like of coefficients. Despite the parameter name this is
      not an estimator; the notebook passes a fitted estimator's ``coef_``.
      Both a flat (1-D) vector and a nested (2-D) array are accepted and
      flattened in row-major order.
    features: Optional sequence of feature names, one per coefficient.
      Defaults to the notebook-level ``feature`` list.

  Returns:
    DataFrame with columns ``X`` (feature name) and ``coef``, sorted by
    ``coef`` in descending order. The index keeps each feature's original
    position.

  Raises:
    ValueError: If the number of coefficients differs from the number of
      feature names.
  """
  # Local import: numpy is not imported in the visible notebook cells.
  import numpy as np

  names = list(feature if features is None else features)
  # ravel() handles 1-D and 2-D input alike; the original nested loop
  # raised TypeError when handed a flat coefficient vector.
  coef = np.asarray(model).ravel()
  if len(coef) != len(names):
    raise ValueError(
        'got %d coefficients for %d feature names' % (len(coef), len(names)))

  model_df = pd.DataFrame({'X': names, 'coef': coef})
  return model_df.sort_values('coef', ascending=False)

# Linear Regression

In [18]:
from sklearn.linear_model import LinearRegression
# Baseline: ordinary linear regression fitted on the scaled training split.
# NOTE(review): `success` is also fitted with Logit further down, so it looks
# binary -- confirm that a plain linear fit is intended here.
LI = LinearRegression()
LI.fit(X_train, y_train)

LinearRegression()

In [19]:
# Report LI.score on the training split, then on the held-out split.
for split_name, split_X, split_y in (
    ('train_set : ', X_train, y_train),
    ('test_set : ', X_test, y_test),
):
  print(split_name, LI.score(split_X, split_y))

train_set :  0.33256471312370695
test_set :  0.24846121144288036


In [20]:
# Pair each feature name with its linear-regression coefficient and show
# the table ordered from the largest coefficient to the smallest.
linearcoef = list(LI.coef_)
LI_df = pd.DataFrame({'X': feature, 'coef': linearcoef})
LI_df.sort_values('coef', ascending=False)


Unnamed: 0,X,coef
7,supporter_count,0.083406
8,like_count,0.070471
1,jpg_count,0.039385
6,new_info_count,0.030365
14,title_length,0.023737
10,price_avg,0.019229
5,facebook_supporter_count,0.013601
19,jpg_percent,0.011648
17,encore,0.011264
3,price,0.009761


# Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with scikit-learn's default settings, fitted on the
# scaled training split.
LR = LogisticRegression()
LR.fit(X_train, y_train)

LogisticRegression()

In [22]:
# Report LR.score on the training split, then on the held-out split.
for split_name, split_X, split_y in (
    ('train_set : ', X_train, y_train),
    ('test_set : ', X_test, y_test),
):
  print(split_name, LR.score(split_X, split_y))

train_set :  0.940893470790378
test_set :  0.9197530864197531


In [23]:
import statsmodels.api as sm

# statsmodels Logit of `success` on the same 26 predictors, fitted on the
# full `dataset` frame (not the scaled train split). The formula is split
# across adjacent literals for readability; they concatenate to one string.
logis = sm.Logit.from_formula(
    'success ~ image_count + jpg_count + video_count + price + price_goal'
    ' + facebook_supporter_count + new_info_count + supporter_count'
    ' + like_count + newness + price_avg + product_count'
    ' + maker_respon_time + funding_duration + title_length'
    ' + summary_length + content_length + encore + image_percent'
    ' + jpg_percent + video_percent + price_power + market_saturation'
    ' + topic1 + topic2 + topic3',
    dataset,
).fit()

logis.summary()

Optimization terminated successfully.
         Current function value: 0.137824
         Iterations 11


0,1,2,3
Dep. Variable:,success,No. Observations:,1941.0
Model:,Logit,Df Residuals:,1914.0
Method:,MLE,Df Model:,26.0
Date:,"Wed, 26 Oct 2022",Pseudo R-squ.:,0.6041
Time:,00:17:54,Log-Likelihood:,-267.52
converged:,True,LL-Null:,-675.7
Covariance Type:,nonrobust,LLR p-value:,2.468e-155

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-6.1860,1.713,-3.611,0.000,-9.544,-2.828
image_count,0.0065,0.006,1.061,0.289,-0.006,0.019
jpg_count,0.0419,0.010,4.357,0.000,0.023,0.061
video_count,-0.2057,0.131,-1.571,0.116,-0.462,0.051
price,2.43e-06,5.5e-07,4.418,0.000,1.35e-06,3.51e-06
price_goal,-1.945e-06,1.93e-07,-10.087,0.000,-2.32e-06,-1.57e-06
facebook_supporter_count,0.0060,0.006,0.953,0.340,-0.006,0.018
new_info_count,0.0005,0.000,3.605,0.000,0.000,0.001
supporter_count,0.0013,0.000,4.292,0.000,0.001,0.002


Logistic Regression 회귀계수

In [24]:
# Rank the scikit-learn logistic-regression coefficients (fitted on the
# scaled features) from largest to smallest.
model_coef(LR.coef_)

Unnamed: 0,X,coef
8,like_count,1.265697
7,supporter_count,0.887143
10,price_avg,0.494841
1,jpg_count,0.425863
6,new_info_count,0.409562
14,title_length,0.405624
3,price,0.280336
17,encore,0.21309
0,image_count,0.096956
11,product_count,0.088282


# Decision Tree

In [25]:
from sklearn.tree import DecisionTreeClassifier

# Fix random_state: scikit-learn permutes candidate features at every split,
# so an unseeded tree can differ (and score differently) from run to run.
DT = DecisionTreeClassifier(random_state=42)
DT.fit(X_train, y_train)

DecisionTreeClassifier()

In [26]:
# Report DT.score on the training split, then on the held-out split.
for split_name, split_X, split_y in (
    ('train_set : ', X_train, y_train),
    ('test_set : ', X_test, y_test),
):
  print(split_name, DT.score(split_X, split_y))

train_set :  1.0
test_set :  0.8806584362139918


DecisionTree 특성 중요도 (feature importance)

# SVM

In [27]:
import sklearn.svm as svm
