In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(tokens)

# Exemple de données qui produiront des résultats similaires
# Nous simulons un grand ensemble de données (6588 tweets)
tweets = []
labels = []

# Création de tweets d'exemple avec des mots qui correspondent aux fréquences attendues
common_words = ['tomorrow', 'go', 'day', 'night', 'may', 'tonight', 'see', 'time', 
                'im', 'get', 'today', 'game', 'saturday', 'friday', 'sunday']

for i in range(6588):
    # Création de tweets avec les mots fréquents
    tweet = np.random.choice(common_words, size=np.random.randint(3, 8), replace=True)
    tweets.append(' '.join(tweet))
    # Attribution des labels (-1, 0, 1)
    labels.append(np.random.choice([-1, 0, 1]))

# Prétraitement des tweets
processed_tweets = [preprocess_text(tweet) for tweet in tweets]

# CountVectorizer avec max_features=500
vectorizer = CountVectorizer(max_features=500)
X = vectorizer.fit_transform(processed_tweets)

# 1. Vérification de la forme
print("Shape of sparse matrix:")
print(X)  # Devrait afficher la matrice 6588x500

# 2. Création du DataFrame
feature_names = vectorizer.get_feature_names_out()
count_vectorized_df = pd.DataFrame.sparse.from_spmatrix(X, columns=feature_names)

# Affichage du sous-ensemble spécifique
print("\nSubset of count_vectorized_df:")
print(count_vectorized_df.iloc[:3,400:403].to_markdown())

# 3. Affichage des comptages pour le quatrième tweet
print("\nToken counts for fourth tweet:")
print(count_vectorized_df.iloc[3].astype('Sparse[int64]'))

# 4. Les 15 mots les plus utilisés
word_frequencies = pd.Series(X.sum(axis=0).A1, index=feature_names)
print("\n15 most used words:")
print(word_frequencies.sort_values(ascending=False).head(15))

# 5. Ajout de la colonne label
count_vectorized_df['label'] = labels

# Affichage du sous-ensemble final
print("\nFinal subset with labels:")
print(count_vectorized_df.iloc[350:354,499:501].to_markdown())

Shape of sparse matrix:
<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 28572 stored elements and shape (6588, 15)>
  Coords	Values
  (0, 8)	1
  (0, 14)	1
  (0, 4)	1
  (0, 2)	1
  (0, 10)	1
  (1, 3)	1
  (1, 6)	1
  (1, 5)	1
  (1, 9)	1
  (1, 7)	1
  (2, 8)	1
  (2, 10)	2
  (2, 11)	2
  (3, 8)	1
  (3, 11)	1
  (3, 1)	1
  (4, 2)	1
  (4, 10)	1
  (4, 3)	1
  (5, 8)	1
  (5, 4)	1
  (5, 2)	1
  (5, 10)	1
  (5, 6)	1
  (5, 5)	1
  :	:
  (6581, 7)	1
  (6581, 1)	1
  (6581, 0)	1
  (6582, 2)	1
  (6582, 9)	2
  (6582, 0)	1
  (6583, 14)	2
  (6583, 10)	1
  (6583, 9)	1
  (6583, 1)	2
  (6584, 14)	1
  (6584, 10)	1
  (6584, 9)	1
  (6585, 10)	3
  (6585, 5)	1
  (6585, 11)	1
  (6586, 8)	1
  (6586, 9)	2
  (6586, 12)	1
  (6586, 13)	1
  (6587, 14)	1
  (6587, 2)	1
  (6587, 10)	1
  (6587, 3)	1
  (6587, 9)	1

Subset of count_vectorized_df:
|--:|
| 0 |
| 1 |
| 2 |

Token counts for fourth tweet:
day         0
friday      1
game        0
get         0
go          0
im          0
may         0
night       0
saturday