In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix


In [None]:
# Read the labelled tweets from the CSV file and preview the first five rows.
tweets = pd.read_csv(filepath_or_buffer="/content/train.csv")
tweets.head(n=5)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [None]:
# Summarise the frame: number of rows, column names, non-null counts and dtypes.
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27480 entries, 0 to 27479
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27480 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27480 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB


**Drop "neutral" values from the dataset**




**What is the percentage of positive/negative tweets?**

In [None]:
# Keep only the rows whose sentiment label is not 'neutral', so that the
# remaining data contains just the other sentiment classes.
is_not_neutral = tweets['sentiment'].ne('neutral')
tweets = tweets.loc[is_not_neutral]

# Share of each remaining sentiment label, expressed as a percentage.
tweets['sentiment'].value_counts(normalize=True).mul(100)

positive    52.447595
negative    47.552405
Name: sentiment, dtype: float64

**Apply a train test split**

In [None]:
# Features are the raw tweet texts; the target is the sentiment label.
X, y = tweets['text'], tweets['sentiment']

# Hold out a quarter of the rows for evaluation. test_size=0.25 is the value
# scikit-learn uses when no size is given; it is spelled out here for clarity.
# The fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=32,
)

**Create a vectorizer with scikit-learn's `TfidfVectorizer` class.**

In [None]:
# Fit the TF-IDF vectorizer on the training tweets only and turn them into
# a sparse document-term matrix in the same step.
tfidf = TfidfVectorizer()
X_train_CV = tfidf.fit_transform(raw_documents=X_train)

# Re-use the already fitted vectorizer for the test tweets: transform() does
# not refit, so the test matrix has the same columns as the training matrix.
X_test_CV = tfidf.transform(raw_documents=X_test)

# Show the shape and sparsity of the training matrix.
X_train_CV

<12272x15806 sparse matrix of type '<class 'numpy.float64'>'
	with 144578 stored elements in Compressed Sparse Row format>

**Train a logistic regression**

In [None]:
# Here we train the model only on the train dataset.
model = LogisticRegression()

# Train the model on the training data
model.fit(X_train_CV, y_train)


# And now we compare both scores :
print("\nScore for the Train dataset :", model.score(X_train_CV, y_train))
print("Score for the Test dataset :", model.score(X_test_CV, y_test))


Score for the Train dataset : 0.9321219035202086
Score for the Test dataset : 0.8731361525299438


**Confusion matrix**

In [None]:
# Confusion matrix on the test set, with rows = actual class and
# columns = predicted class.
#
# labels=model.classes_ pins the row/column order (and the matrix shape) to the
# same class list that is used to name the rows and columns below. Without it,
# confusion_matrix derives its own label set from y_test and the predictions,
# which can differ from model.classes_ when a class is missing from the test split.
test_confusion = confusion_matrix(
    y_true=y_test,
    y_pred=model.predict(X_test_CV),
    labels=model.classes_,
)

# Build the axis names with f-strings rather than `array + str`, which only
# works when model.classes_ has object dtype.
pd.DataFrame(
    data=test_confusion,
    index=[f"{label} actual" for label in model.classes_],
    columns=[f"{label} predicted" for label in model.classes_],
)

Unnamed: 0,negative predicted,positive predicted
negative actual,1692,243
positive actual,276,1880


**Display 10 misclassified tweets (false positives or false negatives)**

In [None]:
# Predicted label for every tweet of the test set.
predictions = model.predict(X_test_CV)

# Plain arrays, so that texts, true labels and predictions can all be
# addressed by the same positional index.
y_testArray = y_test.to_numpy()
X_testArray = X_test.to_numpy()

# Positions where the predicted label differs from the true label.
incorrect_indices = [
    position
    for position, (actual, predicted) in enumerate(zip(y_testArray, predictions))
    if actual != predicted
]

# Show the first ten misclassified tweets, each followed by a blank line.
for i in incorrect_indices[:10]:
    print(
        f'Tweet: {X_testArray[i]}',
        f'Prediction: {predictions[i]}',
        f'Actual: {y_testArray[i]}',
        '',
        sep='\n',
    )

Tweet: HollowbabesHere comes the utter shite #bgt <I completely agree
Prediction: positive
Actual: negative

Tweet:  SUFFICATION NO BREATHING. It`s okay. There`ll be more. You`re invited to mine, but I can`t promise fun times.  *Jinx
Prediction: positive
Actual: negative

Tweet: i wanna vote for Miley Cyrus for the mtv movie awards..but i don`t know where i could  somebody could send me a link? thaank you <3
Prediction: negative
Actual: positive

Tweet: I love music so much that i`ve gone through pain to play :S my sides of my fingers now are peeling and have blisters from playing so much
Prediction: positive
Actual: negative

Tweet: I can only message those who message me, if we`re fwends...so those that want replies..follow me.  hmm..that sounds funny..
Prediction: negative
Actual: positive

Tweet: wish I could feel no pain (8)  but it`s ok, at least they like Brazil!
Prediction: negative
Actual: positive

Tweet:  so glad i`m not at uni anymore
Prediction: negative
Actual: positive

