# Project ATIT - Sentiment Analysis with LinearSVC

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

### Dataset Preperation



In [16]:
# Make data directory if it doesn't exist
!mkdir -p data
!wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/investigating-sentiment-analysis/data/training.1600000.processed.noemoticon.csv.zip -P data
!unzip -n -d data data/training.1600000.processed.noemoticon.csv.zip

File ‘data/training.1600000.processed.noemoticon.csv.zip’ already there; not retrieving.

Archive:  data/training.1600000.processed.noemoticon.csv.zip


In [17]:
import pandas as pd

df = pd.read_csv("data/training.1600000.processed.noemoticon.csv",
                names=['polarity', 'id', 'date', 'query', 'user', 'text'],
                encoding='latin-1')
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   polarity  1600000 non-null  int64 
 1   id        1600000 non-null  int64 
 2   date      1600000 non-null  object
 3   query     1600000 non-null  object
 4   user      1600000 non-null  object
 5   text      1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [18]:
# Change polarity to 0 and 1
df.polarity.value_counts()

0    800000
4    800000
Name: polarity, dtype: int64

In [19]:
df.polarity = df.polarity.replace({0: 0, 4: 1})
df.polarity.value_counts()

0    800000
1    800000
Name: polarity, dtype: int64

In [20]:
df = df.drop(columns=['id', 'date', 'query', 'user'])
df.head()

Unnamed: 0,polarity,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [21]:
df = df.sample(n=500000)
df.polarity.value_counts()

1    250027
0    249973
Name: polarity, dtype: int64

## Working with data


**Data Preperation**

In [22]:
df

Unnamed: 0,polarity,text
489763,0,@cibber Darn Kay Mellor beat me to it for Fat ...
482446,0,@Miss_Tricky thank you i wish i had my girls ...
297875,0,Can't figure out what is wrong with my poor li...
865674,1,@normsutaria are you a professor at LIU? Curre...
451764,0,@iamjersey Wish I could. Ity's on the long wee...
...,...,...
1068573,1,@jaycrafty fashofasho will be at kimbois caus...
414256,0,"Ok, I've been home since 7. but Stacy has been..."
1497930,1,"@FontSiteDiva We is stoked too, Ms Diva! This ..."
516942,0,reeeally want to download the hush hush PCD so...


In [23]:
vectorizer = TfidfVectorizer(max_features=1000)
vectors = vectorizer.fit_transform(df.text)
words_df = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names())
words_df.head()



Unnamed: 0,10,100,11,12,15,1st,20,2day,2nd,30,...,yesterday,yet,yo,you,your,yours,yourself,youtube,yummy,yup
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.213631,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.390218,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
X = words_df
y = df.polarity

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8)

In [26]:
svc = LinearSVC()
svc.fit(X_train, y_train)

LinearSVC()

In [27]:
y_true = y_test
y_pred = svc.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['negative', 'positive'])
pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names)

Unnamed: 0,Predicted negative,Predicted positive
Is negative,37387,12700
Is positive,10793,39120


In [28]:
y_true = y_test
y_pred = svc.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['negative', 'positive'])
pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names).div(matrix.sum(axis=1), axis=0)

Unnamed: 0,Predicted negative,Predicted positive
Is negative,0.746441,0.253559
Is positive,0.216236,0.783764


#**Predict our own datasets filled with tweets**

In [39]:
df_2 = pd.read_csv('https://raw.githubusercontent.com/IvanWasNotAvailable/StockPricePrediction/main/tweets.csv',names=['text'], )

In [45]:
unknown_vectors = vectorizer.transform(df_2.text)
unknown_words_df = pd.DataFrame(unknown_vectors.toarray(), columns=vectorizer.get_feature_names())
unknown_words_df.head()



Unnamed: 0,10,100,11,12,15,1st,20,2day,2nd,30,...,yesterday,yet,yo,you,your,yours,yourself,youtube,yummy,yup
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
df_2['pred_svc'] = svc.predict(unknown_words_df)
df_2

Unnamed: 0,text,pred_svc
,text,1
0.0,"Today, on International Civil Aviation Day, we...",1
1.0,How can chemical production processes become m...,0
2.0,Inditex and BASF have developed a detergent th...,0
3.0,BASF to restore polluted Superfund site in New...,1
4.0,“The damage done can never be wholly undone. N...,0
5.0,The DEP announced a proposed voluntarily Natur...,1
6.0,BASF to restore polluted Superfund site in New...,1
7.0,"BASF, the largest chemical producer in the wor...",1
8.0,"For the past 30 years, Lori Goucher, has slowl...",1
